| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200 |
- import numpy as np
- import pandas as pd
- import random
- from datetime import datetime, timedelta
- from sklearn.ensemble import IsolationForest
def random_time(start=None, end=None):
    """Return a random timestamp between ``start`` and ``end`` as an ISO string.

    Args:
        start: Earliest possible ``datetime``. Defaults to
            ``datetime(2020, 1, 1)``.
        end: Latest possible ``datetime``. Defaults to ``datetime.now()``;
            pass it explicitly when results must be reproducible under a
            fixed ``random`` seed, because the default moves with the clock.

    Returns:
        ``start`` plus a whole number of seconds drawn with
        ``random.randint`` (both ends inclusive), formatted with
        ``datetime.isoformat()``.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    if start is None:
        start = datetime(2020, 1, 1)
    if end is None:
        end = datetime.now()
    if end < start:
        # randint would raise an opaque "empty range" error; say what is wrong.
        raise ValueError("end must not be earlier than start")

    # Truncate to whole seconds so the offset never overshoots ``end``.
    span_seconds = int((end - start).total_seconds())
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return (start + offset).isoformat()
def generate_good_log():
    """Build one synthetic log dict describing a successful backup run.

    Counters and sizes are drawn with ``random.randint``; all error-related
    counters are zero and ``ParsedResult`` is ``"Success"``.

    The top-level timing fields are derived from one another instead of
    being drawn independently: ``BeginTime`` comes from ``random_time()``,
    ``Duration`` is a random 10-59 second span, and ``EndTime`` is
    ``BeginTime`` plus the whole-second part of that span (timestamps keep
    whole-second resolution). ``EndTime`` can therefore never precede
    ``BeginTime``.

    Returns:
        A dict with top-level result fields and a nested
        ``"BackendStatistics"`` dict.
    """
    began_at = datetime.fromisoformat(random_time())
    duration_seconds = random.randint(10, 59)
    duration_fraction = random.randint(0, 9999999)
    ended_at = began_at + timedelta(seconds=duration_seconds)

    log = {
        "DeletedFiles": random.randint(0, 5),
        "DeletedFolders": random.randint(0, 2),
        "ModifiedFiles": random.randint(0, 50),
        "ExaminedFiles": random.randint(500, 1000),
        "OpenedFiles": random.randint(0, 50),
        "AddedFiles": random.randint(0, 10),
        "SizeOfModifiedFiles": random.randint(1000000, 2000000000),
        "SizeOfAddedFiles": random.randint(0, 10000000),
        "SizeOfExaminedFiles": random.randint(1000000000, 8000000000),
        "SizeOfOpenedFiles": random.randint(0, 2000000000),
        "NotProcessedFiles": 0,
        "AddedFolders": random.randint(0, 2),
        "TooLargeFiles": 0,
        "FilesWithError": 0,
        "ModifiedFolders": random.randint(0, 2),
        "ModifiedSymlinks": 0,
        "AddedSymlinks": 0,
        "DeletedSymlinks": 0,
        "PartialBackup": False,
        "Dryrun": False,
        "MainOperation": "Backup",
        "ParsedResult": "Success",
        "Interrupted": False,
        "Version": "2.1.0.4 (2.1.0.4_stable_2025-01-31)",
        "EndTime": ended_at.isoformat(),
        "BeginTime": began_at.isoformat(),
        "Duration": "00:00:{:02d}.{:07d}".format(
            duration_seconds, duration_fraction
        ),
        "MessagesActualLength": random.randint(10, 30),
        "WarningsActualLength": random.randint(0, 1),
        "ErrorsActualLength": 0,
        "Messages": [],
        "Warnings": [],
        "Errors": [],
        "BackendStatistics": {
            "RemoteCalls": random.randint(5, 15),
            "BytesUploaded": random.randint(1000000, 100000000),
            "BytesDownloaded": random.randint(1000000, 100000000),
            "FilesUploaded": random.randint(1, 10),
            "FilesDownloaded": random.randint(1, 10),
            "FilesDeleted": random.randint(0, 5),
            "FoldersCreated": random.randint(0, 2),
            "RetryAttempts": 0,
            "UnknownFileSize": 0,
            "UnknownFileCount": 0,
            "KnownFileCount": random.randint(100, 1000),
            "KnownFileSize": random.randint(1000000000, 8000000000),
            # Backend timestamps remain independent random draws.
            "LastBackupDate": random_time(),
            "BackupListCount": random.randint(5, 20),
            "TotalQuotaSpace": 1932937191424,
            "FreeQuotaSpace": random.randint(400000000000, 800000000000),
            "AssignedQuotaSpace": -1,
            "ReportedQuotaError": False,
            "ReportedQuotaWarning": False,
            "MainOperation": "Backup",
            "ParsedResult": "Success",
            "Interrupted": False,
            "Version": "2.1.0.4 (2.1.0.4_stable_2025-01-31)",
            "EndTime": "0001-01-01T00:00:00",
            "BeginTime": random_time(),
            "Duration": "00:00:00",
            "MessagesActualLength": 0,
            "WarningsActualLength": 0,
            "ErrorsActualLength": 0,
            "Messages": None,
            "Warnings": None,
            "Errors": None,
        },
    }
    return log
def generate_bad_log():
    """Build one synthetic log dict describing a failed, interrupted backup.

    Compared with the successful variant, this log has excessive deletions,
    non-zero error counters, ``ParsedResult`` set to ``"Error"`` and
    ``Interrupted`` set to ``True``.

    Internal consistency guarantees:
      * ``EndTime`` is ``BeginTime`` plus the whole-second part of
        ``Duration``, so it never precedes ``BeginTime``.
      * ``WarningsActualLength`` and ``ErrorsActualLength`` equal the lengths
        of the ``Warnings`` and ``Errors`` lists (5-20 and 1-5 entries).

    Returns:
        A dict with top-level result fields and a nested
        ``"BackendStatistics"`` dict.
    """
    error_texts = [
        "Error: Disk full",
        "Error: Permission denied",
        "Error: Network timeout",
    ]
    warning_texts = [
        "Warning: Slow upload",
        "Warning: File skipped",
        "Warning: Retry exceeded",
    ]

    began_at = datetime.fromisoformat(random_time())
    duration_seconds = random.randint(10, 59)
    duration_fraction = random.randint(0, 9999999)
    ended_at = began_at + timedelta(seconds=duration_seconds)

    # NOTE(review): messages are sampled from the warning texts, exactly as
    # the original code did -- confirm that this is intended.
    message_entries = [
        random.choice(warning_texts) for _ in range(random.randint(5, 20))
    ]
    warning_entries = [
        random.choice(warning_texts) for _ in range(random.randint(5, 20))
    ]
    error_entries = [
        random.choice(error_texts) for _ in range(random.randint(1, 5))
    ]

    log = {
        "DeletedFiles": random.randint(100, 10000),  # excessive deletions
        "DeletedFolders": random.randint(10, 100),
        "ModifiedFiles": random.randint(0, 10),
        "ExaminedFiles": random.randint(500, 1000),
        "OpenedFiles": random.randint(0, 10),
        "AddedFiles": random.randint(0, 2),
        "SizeOfModifiedFiles": random.randint(1000000, 200000000),
        "SizeOfAddedFiles": random.randint(0, 1000000),
        "SizeOfExaminedFiles": random.randint(1000000000, 8000000000),
        "SizeOfOpenedFiles": random.randint(0, 200000000),
        "NotProcessedFiles": random.randint(1, 10),
        "AddedFolders": random.randint(0, 2),
        "TooLargeFiles": random.randint(0, 2),
        "FilesWithError": random.randint(1, 5),
        "ModifiedFolders": random.randint(0, 2),
        "ModifiedSymlinks": 0,
        "AddedSymlinks": 0,
        "DeletedSymlinks": 0,
        "PartialBackup": True,
        "Dryrun": False,
        "MainOperation": "Backup",
        "ParsedResult": "Error",
        "Interrupted": True,
        "Version": "2.1.0.4 (2.1.0.4_stable_2025-01-31)",
        "EndTime": ended_at.isoformat(),
        "BeginTime": began_at.isoformat(),
        "Duration": "00:00:{:02d}.{:07d}".format(
            duration_seconds, duration_fraction
        ),
        # Left as an independent draw (30-100), which is always larger than
        # the sampled message list (at most 20 entries).
        "MessagesActualLength": random.randint(30, 100),
        "WarningsActualLength": len(warning_entries),
        "ErrorsActualLength": len(error_entries),
        "Messages": message_entries,
        "Warnings": warning_entries,
        "Errors": error_entries,
        "BackendStatistics": {
            "RemoteCalls": random.randint(5, 15),
            "BytesUploaded": random.randint(100000, 1000000),
            "BytesDownloaded": random.randint(100000, 1000000),
            "FilesUploaded": random.randint(0, 2),
            "FilesDownloaded": random.randint(0, 2),
            "FilesDeleted": random.randint(100, 10000),
            "FoldersCreated": random.randint(0, 2),
            "RetryAttempts": random.randint(1, 10),
            "UnknownFileSize": random.randint(0, 1),
            "UnknownFileCount": random.randint(0, 1),
            "KnownFileCount": random.randint(100, 1000),
            "KnownFileSize": random.randint(1000000000, 8000000000),
            # Backend timestamps remain independent random draws.
            "LastBackupDate": random_time(),
            "BackupListCount": random.randint(5, 20),
            "TotalQuotaSpace": 1932937191424,
            "FreeQuotaSpace": random.randint(400000000000, 800000000000),
            "AssignedQuotaSpace": -1,
            "ReportedQuotaError": random.choice([True, False]),
            "ReportedQuotaWarning": random.choice([True, False]),
            "MainOperation": "Backup",
            "ParsedResult": "Error",
            "Interrupted": True,
            "Version": "2.1.0.4 (2.1.0.4_stable_2025-01-31)",
            "EndTime": "0001-01-01T00:00:00",
            "BeginTime": random_time(),
            "Duration": "00:00:00",
            "MessagesActualLength": random.randint(5, 20),
            "WarningsActualLength": random.randint(5, 20),
            "ErrorsActualLength": random.randint(1, 5),
            "Messages": None,
            "Warnings": None,
            "Errors": None,
        },
    }
    return log
def _duration_to_seconds(text):
    """Convert a ``[d.]hh:mm:ss[.fraction]`` duration string to float seconds.

    Returns 0.0 when ``text`` is not a string or does not have that shape.
    """
    if not isinstance(text, str):
        return 0.0
    try:
        # Unpacking raises ValueError unless there are exactly three parts.
        hours_part, minutes_part, seconds_part = text.strip().split(":")
        days = 0
        if "." in hours_part:
            # Optional "d." day prefix, as in .NET TimeSpan text.
            # NOTE(review): confirm real logs use that form.
            days_part, hours_part = hours_part.split(".", 1)
            days = int(days_part)
        return (
            days * 86400
            + int(hours_part) * 3600
            + int(minutes_part) * 60
            + float(seconds_part)
        )
    except ValueError:
        return 0.0


def flatten_features(log):
    """Extract a flat feature row from a log dict.

    Backend counters are read from ``log["BackendStatistics"]``; the
    duration and the error/warning counts come from the top level. Every
    missing field falls back to 0, and a missing or ``None``
    ``BackendStatistics`` is treated as empty.

    ``Duration_sec`` is parsed from the log's own ``Duration`` field, so the
    same log always yields the same row.

    Args:
        log: Mapping shaped like the dicts built by the log generators.

    Returns:
        A dict of numeric features plus ``"label"``, which is 1 when
        ``ParsedResult`` equals ``"Error"`` or ``Interrupted`` is truthy,
        otherwise 0.
    """
    stats = log.get("BackendStatistics") or {}
    is_failure = log.get("ParsedResult") == "Error" or log.get("Interrupted")
    return {
        "KnownFileCount": stats.get("KnownFileCount", 0),
        "KnownFileSize": stats.get("KnownFileSize", 0),
        "FilesUploaded": stats.get("FilesUploaded", 0),
        "FilesDownloaded": stats.get("FilesDownloaded", 0),
        "FilesDeleted": stats.get("FilesDeleted", 0),
        "BytesUploaded": stats.get("BytesUploaded", 0),
        "BytesDownloaded": stats.get("BytesDownloaded", 0),
        "Duration_sec": _duration_to_seconds(log.get("Duration")),
        "ErrorsActualLength": log.get("ErrorsActualLength", 0),
        "WarningsActualLength": log.get("WarningsActualLength", 0),
        "label": 1 if is_failure else 0,
    }
def generate_sample_data(n_good=100, n_bad=10, random_state=42):
    """Create a DataFrame of features from synthetic good and bad logs.

    Both NumPy's global RNG and the ``random`` module are seeded with
    ``random_state`` before anything is generated.

    Args:
        n_good: Number of logs to build with ``generate_good_log``.
        n_bad: Number of logs to build with ``generate_bad_log``.
        random_state: Seed passed to ``np.random.seed`` and ``random.seed``.

    Returns:
        A ``pandas.DataFrame`` with one ``flatten_features`` row per log,
        good logs first and bad logs after them.
    """
    np.random.seed(random_state)
    random.seed(random_state)

    # Build every log before flattening any, good ones first.
    logs = [generate_good_log() for _ in range(n_good)]
    logs.extend(generate_bad_log() for _ in range(n_bad))

    rows = [flatten_features(entry) for entry in logs]
    return pd.DataFrame(rows)
def try_isolation_forest(df):
    """Fit an IsolationForest on the feature columns and print the outcome.

    Adds a ``"predicted_anomaly"`` column to ``df`` in place (1 where
    ``fit_predict`` returned -1, otherwise 0), then prints the feature
    columns together with ``"label"`` and ``"predicted_anomaly"``.

    Args:
        df: DataFrame holding the feature columns listed below and a
            ``"label"`` column.
    """
    feature_columns = [
        "KnownFileCount",
        "KnownFileSize",
        "FilesUploaded",
        "FilesDownloaded",
        "FilesDeleted",
        "BytesUploaded",
        "BytesDownloaded",
        "Duration_sec",
        "ErrorsActualLength",
        "WarningsActualLength",
    ]

    model = IsolationForest(random_state=42)
    is_outlier = model.fit_predict(df[feature_columns]) == -1
    df["predicted_anomaly"] = is_outlier.astype(int)

    report_columns = [*feature_columns, "label", "predicted_anomaly"]
    print(df[report_columns])
if __name__ == "__main__":
    # Script entry point: build the default sample set and run the detector.
    try_isolation_forest(generate_sample_data())
|