# Synthetic backup-metrics anomaly-detection demo (IsolationForest).
import numpy as np  # type: ignore
import pandas as pd
from sklearn.ensemble import IsolationForest
def generate_sample_data(n_normal=100, n_anomaly=10, random_state=42):
    """Build a labeled synthetic dataset of backup-job metrics.

    Parameters
    ----------
    n_normal : int
        Number of rows drawn from the "normal" distribution.
    n_anomaly : int
        Number of rows drawn from the "anomalous" distribution.
    random_state : int
        Seed for the local random generator, making output reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``backup_size_mb``, ``duration_sec``, ``file_count`` and
        ``label`` (0 = normal, 1 = anomaly); normal rows come first.
    """
    # Use a local Generator instead of np.random.seed(...) so calling this
    # function does not mutate NumPy's global RNG state as a side effect.
    rng = np.random.default_rng(random_state)
    # Normal backups: ~1000 MB, ~60 s, ~500 files.
    normal_data = rng.normal(loc=[1000, 60, 500], scale=[100, 10, 50], size=(n_normal, 3))
    # Anomalous backups: much larger and slower, with far fewer files.
    anomaly_data = rng.normal(loc=[2000, 200, 50], scale=[50, 20, 10], size=(n_anomaly, 3))
    # Stack normal rows on top of anomalous rows and label them.
    data = np.vstack([normal_data, anomaly_data])
    df = pd.DataFrame(data, columns=["backup_size_mb", "duration_sec", "file_count"])
    df["label"] = [0] * n_normal + [1] * n_anomaly  # 0: normal, 1: anomaly
    return df
def try_isolation_forest(df):
    """Fit an IsolationForest on backup metrics and flag anomalies.

    Adds a ``predicted_anomaly`` column to *df* in place (1 = flagged as
    anomalous, 0 = normal) and prints the annotated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``backup_size_mb``, ``duration_sec`` and ``file_count``
        columns; the printout also expects a ``label`` column, as produced
        by ``generate_sample_data``.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) DataFrame, returned for convenient chaining.
        (Previously this function returned None, discarding the result.)
    """
    clf = IsolationForest(random_state=42)
    features = df[["backup_size_mb", "duration_sec", "file_count"]]
    # fit_predict returns +1 for inliers and -1 for outliers; map to 0/1.
    preds = clf.fit_predict(features)
    df["predicted_anomaly"] = (preds == -1).astype(int)
    print(df[["backup_size_mb", "duration_sec", "file_count", "label", "predicted_anomaly"]])
    return df
if __name__ == "__main__":
    # Demo entry point: build labeled sample data, then run the detector.
    sample_df = generate_sample_data()
    try_isolation_forest(sample_df)