import numpy as np # type: ignore
import pandas as pd
from sklearn.ensemble import IsolationForest

def generate_sample_data(n_normal=100, n_anomaly=10, random_state=42):
    """Build a synthetic table of backup-job metrics with known anomalies.

    Rows are drawn from two independent Gaussian clusters: a "normal"
    cluster followed by an "anomalous" cluster whose means are far from
    the normal ones.

    Args:
        n_normal: Number of normal rows to generate.
        n_anomaly: Number of anomalous rows to generate.
        random_state: Seed for the random generator. The same seed always
            yields the same table; ``None`` seeds from OS entropy.

    Returns:
        A ``pandas.DataFrame`` with ``n_normal + n_anomaly`` rows and the
        columns ``backup_size_mb``, ``duration_sec``, ``file_count`` and
        ``label`` (0 for normal rows, 1 for anomalous rows). Normal rows
        come first.
    """
    # Use a private generator rather than np.random.seed(): reseeding the
    # global RNG would silently reset random state for every other caller in
    # the process. RandomState(seed) produces the same stream that
    # np.random.seed(seed) followed by np.random.normal(...) would, so the
    # generated values are unchanged.
    rng = np.random.RandomState(random_state)

    # Normal jobs (backup size, duration, file count).
    normal_data = rng.normal(
        loc=[1000, 60, 500], scale=[100, 10, 50], size=(n_normal, 3)
    )
    # Anomalous jobs: much larger size/duration and far fewer files.
    anomaly_data = rng.normal(
        loc=[2000, 200, 50], scale=[50, 20, 10], size=(n_anomaly, 3)
    )

    df = pd.DataFrame(
        np.vstack([normal_data, anomaly_data]),
        columns=["backup_size_mb", "duration_sec", "file_count"],
    )
    df["label"] = [0] * n_normal + [1] * n_anomaly  # 0: normal, 1: anomaly
    return df

def try_isolation_forest(df):
    """Fit an IsolationForest on the backup metrics and print the result.

    The three metric columns are used as features. A new
    ``predicted_anomaly`` column is written onto ``df`` in place (1 where
    the model returned -1, otherwise 0), and the metrics, the ``label``
    column and the predictions are printed. Nothing is returned.
    """
    feature_cols = ["backup_size_mb", "duration_sec", "file_count"]
    model = IsolationForest(random_state=42)

    raw_preds = model.fit_predict(df[feature_cols])

    # Convert the model's -1 marker into a 0/1 flag.
    is_outlier = raw_preds == -1
    df["predicted_anomaly"] = is_outlier.astype(int)

    print(df[feature_cols + ["label", "predicted_anomaly"]])

if __name__ == "__main__":
    # Demo run: build the default synthetic dataset and print the detector's
    # predictions next to the labels.
    try_isolation_forest(generate_sample_data())