# anomaly_detect.py (1.2 KB)

import numpy as np  # type: ignore
import pandas as pd
from sklearn.ensemble import IsolationForest
  4. def generate_sample_data(n_normal=100, n_anomaly=10, random_state=42):
  5. np.random.seed(random_state)
  6. # Generate normal data (e.g., backup size, duration, file count)
  7. normal_data = np.random.normal(loc=[1000, 60, 500], scale=[100, 10, 50], size=(n_normal, 3))
  8. # Generate anomalous data (e.g., much larger/smaller values)
  9. anomaly_data = np.random.normal(loc=[2000, 200, 50], scale=[50, 20, 10], size=(n_anomaly, 3))
  10. # Combine and create DataFrame
  11. data = np.vstack([normal_data, anomaly_data])
  12. df = pd.DataFrame(data, columns=["backup_size_mb", "duration_sec", "file_count"])
  13. df["label"] = [0]*n_normal + [1]*n_anomaly # 0: normal, 1: anomaly
  14. return df
  15. def try_isolation_forest(df):
  16. clf = IsolationForest(random_state=42)
  17. features = df[["backup_size_mb", "duration_sec", "file_count"]]
  18. preds = clf.fit_predict(features)
  19. df["predicted_anomaly"] = (preds == -1).astype(int)
  20. print(df[["backup_size_mb", "duration_sec", "file_count", "label", "predicted_anomaly"]])
  21. if __name__ == "__main__":
  22. df = generate_sample_data()
  23. try_isolation_forest(df)