"""Generate synthetic Duplicati-style backup logs and flag anomalies.

Builds "good" (successful) and "bad" (failed/ransomware-like) backup log
dicts, flattens them into numeric features, and runs scikit-learn's
IsolationForest to see whether the bad runs stand out as anomalies.
"""

import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd


def random_time():
    """Return a random ISO-8601 timestamp between 2020-01-01 and now."""
    start = datetime(2020, 1, 1)
    end = datetime.now()
    delta = end - start
    random_seconds = random.randint(0, int(delta.total_seconds()))
    return (start + timedelta(seconds=random_seconds)).isoformat()


def generate_good_log():
    """Return one synthetic log dict representing a successful backup run."""
    log = {
        "DeletedFiles": random.randint(0, 5),
        "DeletedFolders": random.randint(0, 2),
        "ModifiedFiles": random.randint(0, 50),
        "ExaminedFiles": random.randint(500, 1000),
        "OpenedFiles": random.randint(0, 50),
        "AddedFiles": random.randint(0, 10),
        "SizeOfModifiedFiles": random.randint(1000000, 2000000000),
        "SizeOfAddedFiles": random.randint(0, 10000000),
        "SizeOfExaminedFiles": random.randint(1000000000, 8000000000),
        "SizeOfOpenedFiles": random.randint(0, 2000000000),
        "NotProcessedFiles": 0,
        "AddedFolders": random.randint(0, 2),
        "TooLargeFiles": 0,
        "FilesWithError": 0,
        "ModifiedFolders": random.randint(0, 2),
        "ModifiedSymlinks": 0,
        "AddedSymlinks": 0,
        "DeletedSymlinks": 0,
        "PartialBackup": False,
        "Dryrun": False,
        "MainOperation": "Backup",
        "ParsedResult": "Success",
        "Interrupted": False,
        "Version": "2.1.0.4 (2.1.0.4_stable_2025-01-31)",
        "EndTime": random_time(),
        "BeginTime": random_time(),
        "Duration": "00:00:{:02d}.{:07d}".format(
            random.randint(10, 59), random.randint(0, 9999999)
        ),
        "MessagesActualLength": random.randint(10, 30),
        "WarningsActualLength": random.randint(0, 1),
        "ErrorsActualLength": 0,
        "Messages": [],
        "Warnings": [],
        "Errors": [],
        "BackendStatistics": {
            "RemoteCalls": random.randint(5, 15),
            "BytesUploaded": random.randint(1000000, 100000000),
            "BytesDownloaded": random.randint(1000000, 100000000),
            "FilesUploaded": random.randint(1, 10),
            "FilesDownloaded": random.randint(1, 10),
            "FilesDeleted": random.randint(0, 5),
            "FoldersCreated": random.randint(0, 2),
            "RetryAttempts": 0,
            "UnknownFileSize": 0,
            "UnknownFileCount": 0,
            "KnownFileCount": random.randint(100, 1000),
            "KnownFileSize": random.randint(1000000000, 8000000000),
            "LastBackupDate": random_time(),
            "BackupListCount": random.randint(5, 20),
            "TotalQuotaSpace": 1932937191424,
            "FreeQuotaSpace": random.randint(400000000000, 800000000000),
            "AssignedQuotaSpace": -1,
            "ReportedQuotaError": False,
            "ReportedQuotaWarning": False,
            "MainOperation": "Backup",
            "ParsedResult": "Success",
            "Interrupted": False,
            "Version": "2.1.0.4 (2.1.0.4_stable_2025-01-31)",
            "EndTime": "0001-01-01T00:00:00",
            "BeginTime": random_time(),
            "Duration": "00:00:00",
            "MessagesActualLength": 0,
            "WarningsActualLength": 0,
            "ErrorsActualLength": 0,
            "Messages": None,
            "Warnings": None,
            "Errors": None,
        },
    }
    return log


def generate_bad_log():
    """Return one synthetic log dict representing a failed/suspicious run.

    Marked by mass deletions, errors, warnings, ``ParsedResult == "Error"``
    and ``Interrupted == True``.
    """
    errors = ["Error: Disk full", "Error: Permission denied", "Error: Network timeout"]
    warnings = ["Warning: Slow upload", "Warning: File skipped", "Warning: Retry exceeded"]
    log = {
        "DeletedFiles": random.randint(100, 10000),  # excessive deletions
        "DeletedFolders": random.randint(10, 100),
        "ModifiedFiles": random.randint(0, 10),
        "ExaminedFiles": random.randint(500, 1000),
        "OpenedFiles": random.randint(0, 10),
        "AddedFiles": random.randint(0, 2),
        "SizeOfModifiedFiles": random.randint(1000000, 200000000),
        "SizeOfAddedFiles": random.randint(0, 1000000),
        "SizeOfExaminedFiles": random.randint(1000000000, 8000000000),
        "SizeOfOpenedFiles": random.randint(0, 200000000),
        "NotProcessedFiles": random.randint(1, 10),
        "AddedFolders": random.randint(0, 2),
        "TooLargeFiles": random.randint(0, 2),
        "FilesWithError": random.randint(1, 5),
        "ModifiedFolders": random.randint(0, 2),
        "ModifiedSymlinks": 0,
        "AddedSymlinks": 0,
        "DeletedSymlinks": 0,
        "PartialBackup": True,
        "Dryrun": False,
        "MainOperation": "Backup",
        "ParsedResult": "Error",
        "Interrupted": True,
        "Version": "2.1.0.4 (2.1.0.4_stable_2025-01-31)",
        "EndTime": random_time(),
        "BeginTime": random_time(),
        "Duration": "00:00:{:02d}.{:07d}".format(
            random.randint(10, 59), random.randint(0, 9999999)
        ),
        "MessagesActualLength": random.randint(30, 100),
        "WarningsActualLength": random.randint(5, 20),
        "ErrorsActualLength": random.randint(1, 5),
        # NOTE(review): Messages is sampled from the *warnings* pool — looks
        # like a copy-paste slip, but there is no separate messages pool, so
        # the original behavior is preserved. Confirm intent with the author.
        "Messages": [random.choice(warnings) for _ in range(random.randint(5, 20))],
        "Warnings": [random.choice(warnings) for _ in range(random.randint(5, 20))],
        "Errors": [random.choice(errors) for _ in range(random.randint(1, 5))],
        "BackendStatistics": {
            "RemoteCalls": random.randint(5, 15),
            "BytesUploaded": random.randint(100000, 1000000),
            "BytesDownloaded": random.randint(100000, 1000000),
            "FilesUploaded": random.randint(0, 2),
            "FilesDownloaded": random.randint(0, 2),
            "FilesDeleted": random.randint(100, 10000),
            "FoldersCreated": random.randint(0, 2),
            "RetryAttempts": random.randint(1, 10),
            "UnknownFileSize": random.randint(0, 1),
            "UnknownFileCount": random.randint(0, 1),
            "KnownFileCount": random.randint(100, 1000),
            "KnownFileSize": random.randint(1000000000, 8000000000),
            "LastBackupDate": random_time(),
            "BackupListCount": random.randint(5, 20),
            "TotalQuotaSpace": 1932937191424,
            "FreeQuotaSpace": random.randint(400000000000, 800000000000),
            "AssignedQuotaSpace": -1,
            "ReportedQuotaError": random.choice([True, False]),
            "ReportedQuotaWarning": random.choice([True, False]),
            "MainOperation": "Backup",
            "ParsedResult": "Error",
            "Interrupted": True,
            "Version": "2.1.0.4 (2.1.0.4_stable_2025-01-31)",
            "EndTime": "0001-01-01T00:00:00",
            "BeginTime": random_time(),
            "Duration": "00:00:00",
            "MessagesActualLength": random.randint(5, 20),
            "WarningsActualLength": random.randint(5, 20),
            "ErrorsActualLength": random.randint(1, 5),
            "Messages": None,
            "Warnings": None,
            "Errors": None,
        },
    }
    return log


def _parse_duration_seconds(duration, default=0.0):
    """Parse an ``"HH:MM:SS[.fffffff]"`` duration string into float seconds.

    Returns *default* when *duration* is missing or malformed, so feature
    extraction never crashes on a partial log.
    """
    try:
        hours, minutes, seconds = duration.split(":")
        return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
    except (AttributeError, ValueError):
        return default


def flatten_features(log):
    """Extracts relevant features from top-level and BackendStatistics.

    Returns a flat dict of numeric features plus a ``label`` column:
    1 when the run failed or was interrupted, else 0.
    """
    stats = log.get("BackendStatistics", {})
    return {
        "KnownFileCount": stats.get("KnownFileCount", 0),
        "KnownFileSize": stats.get("KnownFileSize", 0),
        "FilesUploaded": stats.get("FilesUploaded", 0),
        "FilesDownloaded": stats.get("FilesDownloaded", 0),
        "FilesDeleted": stats.get("FilesDeleted", 0),
        "BytesUploaded": stats.get("BytesUploaded", 0),
        "BytesDownloaded": stats.get("BytesDownloaded", 0),
        # Fixed: parse the log's real Duration instead of injecting
        # random.uniform(30, 600) noise (the old placeholder).
        "Duration_sec": _parse_duration_seconds(log.get("Duration")),
        "ErrorsActualLength": log.get("ErrorsActualLength", 0),
        "WarningsActualLength": log.get("WarningsActualLength", 0),
        "label": 1 if log.get("ParsedResult") == "Error" or log.get("Interrupted") else 0,
    }


def generate_sample_data(n_good=100, n_bad=10, random_state=42):
    """Build a seeded DataFrame of *n_good* + *n_bad* flattened log features."""
    np.random.seed(random_state)
    random.seed(random_state)
    good_logs = [generate_good_log() for _ in range(n_good)]
    bad_logs = [generate_bad_log() for _ in range(n_bad)]
    all_logs = good_logs + bad_logs
    features = [flatten_features(log) for log in all_logs]
    df = pd.DataFrame(features)
    return df


def try_isolation_forest(df):
    """Fit an IsolationForest on *df*'s numeric features and print results.

    Adds a ``predicted_anomaly`` column (1 = flagged anomaly) to *df* in place.
    """
    # Lazy import: sklearn is only needed here, so the data-generation half
    # of this module stays usable without it installed.
    from sklearn.ensemble import IsolationForest

    features = [
        "KnownFileCount",
        "KnownFileSize",
        "FilesUploaded",
        "FilesDownloaded",
        "FilesDeleted",
        "BytesUploaded",
        "BytesDownloaded",
        "Duration_sec",
        "ErrorsActualLength",
        "WarningsActualLength",
    ]
    clf = IsolationForest(random_state=42)
    preds = clf.fit_predict(df[features])
    # IsolationForest returns -1 for anomalies, 1 for inliers.
    df["predicted_anomaly"] = (preds == -1).astype(int)
    print(df[features + ["label", "predicted_anomaly"]])


if __name__ == "__main__":
    df = generate_sample_data()
    try_isolation_forest(df)