"""Toy anomaly-detection demo: synthetic backup metrics + IsolationForest."""

from typing import Optional

import numpy as np  # type: ignore
import pandas as pd

# Feature columns shared by the generator and the detector, in column order.
_FEATURE_COLUMNS = ("backup_size_mb", "duration_sec", "file_count")


def generate_sample_data(
    n_normal: int = 100,
    n_anomaly: int = 10,
    random_state: Optional[int] = 42,
) -> pd.DataFrame:
    """Build a labelled DataFrame of synthetic backup metrics.

    Rows are drawn from two independent Gaussian clusters: the first
    ``n_normal`` rows from the "normal" cluster, followed by ``n_anomaly``
    rows from the "anomalous" cluster.

    Args:
        n_normal: Number of rows drawn from the normal cluster.
        n_anomaly: Number of rows drawn from the anomalous cluster.
        random_state: Seed for the random generator. The same seed always
            yields the same frame; ``None`` seeds from OS entropy.

    Returns:
        A DataFrame with the columns ``backup_size_mb``, ``duration_sec``,
        ``file_count`` and ``label`` (0 = normal, 1 = anomaly).
    """
    # A private RandomState yields the same stream as np.random.seed() followed
    # by np.random.normal(), but leaves NumPy's global generator untouched.
    rng = np.random.RandomState(random_state)

    # Normal data (backup size, duration, file count).
    normal_data = rng.normal(
        loc=[1000, 60, 500],
        scale=[100, 10, 50],
        size=(n_normal, len(_FEATURE_COLUMNS)),
    )
    # Anomalous data (much larger/smaller values than the normal cluster).
    anomaly_data = rng.normal(
        loc=[2000, 200, 50],
        scale=[50, 20, 10],
        size=(n_anomaly, len(_FEATURE_COLUMNS)),
    )

    df = pd.DataFrame(
        np.vstack([normal_data, anomaly_data]),
        columns=list(_FEATURE_COLUMNS),
    )
    df["label"] = [0] * n_normal + [1] * n_anomaly  # 0: normal, 1: anomaly
    return df


def try_isolation_forest(df: pd.DataFrame) -> None:
    """Fit an IsolationForest on the feature columns and print the result.

    Adds a ``predicted_anomaly`` column to ``df`` **in place** (1 where
    ``fit_predict`` returned -1, else 0), then prints the feature columns
    together with ``label`` and ``predicted_anomaly``.

    Args:
        df: Frame containing the columns ``backup_size_mb``, ``duration_sec``,
            ``file_count`` and ``label``.
    """
    # Imported here so the data generator stays usable without scikit-learn.
    from sklearn.ensemble import IsolationForest

    feature_names = list(_FEATURE_COLUMNS)
    clf = IsolationForest(random_state=42)
    preds = clf.fit_predict(df[feature_names])

    # fit_predict marks outliers with -1; store them as a 0/1 flag.
    df["predicted_anomaly"] = (preds == -1).astype(int)
    print(df[feature_names + ["label", "predicted_anomaly"]])


if __name__ == "__main__":
    df = generate_sample_data()
    try_isolation_forest(df)