# anomaly_detect2.py — synthetic backup-log generation and IsolationForest anomaly detection demo
  1. import numpy as np
  2. import pandas as pd
  3. import random
  4. from datetime import datetime, timedelta
  5. from sklearn.ensemble import IsolationForest
  6. def random_time():
  7. start = datetime(2020, 1, 1)
  8. end = datetime.now()
  9. delta = end - start
  10. random_seconds = random.randint(0, int(delta.total_seconds()))
  11. return (start + timedelta(seconds=random_seconds)).isoformat()
  12. def generate_good_log():
  13. log = {
  14. "DeletedFiles": random.randint(0, 5),
  15. "DeletedFolders": random.randint(0, 2),
  16. "ModifiedFiles": random.randint(0, 50),
  17. "ExaminedFiles": random.randint(500, 1000),
  18. "OpenedFiles": random.randint(0, 50),
  19. "AddedFiles": random.randint(0, 10),
  20. "SizeOfModifiedFiles": random.randint(1000000, 2000000000),
  21. "SizeOfAddedFiles": random.randint(0, 10000000),
  22. "SizeOfExaminedFiles": random.randint(1000000000, 8000000000),
  23. "SizeOfOpenedFiles": random.randint(0, 2000000000),
  24. "NotProcessedFiles": 0,
  25. "AddedFolders": random.randint(0, 2),
  26. "TooLargeFiles": 0,
  27. "FilesWithError": 0,
  28. "ModifiedFolders": random.randint(0, 2),
  29. "ModifiedSymlinks": 0,
  30. "AddedSymlinks": 0,
  31. "DeletedSymlinks": 0,
  32. "PartialBackup": False,
  33. "Dryrun": False,
  34. "MainOperation": "Backup",
  35. "ParsedResult": "Success",
  36. "Interrupted": False,
  37. "Version": "2.1.0.4 (2.1.0.4_stable_2025-01-31)",
  38. "EndTime": random_time(),
  39. "BeginTime": random_time(),
  40. "Duration": "00:00:{:02d}.{:07d}".format(random.randint(10, 59), random.randint(0, 9999999)),
  41. "MessagesActualLength": random.randint(10, 30),
  42. "WarningsActualLength": random.randint(0, 1),
  43. "ErrorsActualLength": 0,
  44. "Messages": [],
  45. "Warnings": [],
  46. "Errors": [],
  47. "BackendStatistics": {
  48. "RemoteCalls": random.randint(5, 15),
  49. "BytesUploaded": random.randint(1000000, 100000000),
  50. "BytesDownloaded": random.randint(1000000, 100000000),
  51. "FilesUploaded": random.randint(1, 10),
  52. "FilesDownloaded": random.randint(1, 10),
  53. "FilesDeleted": random.randint(0, 5),
  54. "FoldersCreated": random.randint(0, 2),
  55. "RetryAttempts": 0,
  56. "UnknownFileSize": 0,
  57. "UnknownFileCount": 0,
  58. "KnownFileCount": random.randint(100, 1000),
  59. "KnownFileSize": random.randint(1000000000, 8000000000),
  60. "LastBackupDate": random_time(),
  61. "BackupListCount": random.randint(5, 20),
  62. "TotalQuotaSpace": 1932937191424,
  63. "FreeQuotaSpace": random.randint(400000000000, 800000000000),
  64. "AssignedQuotaSpace": -1,
  65. "ReportedQuotaError": False,
  66. "ReportedQuotaWarning": False,
  67. "MainOperation": "Backup",
  68. "ParsedResult": "Success",
  69. "Interrupted": False,
  70. "Version": "2.1.0.4 (2.1.0.4_stable_2025-01-31)",
  71. "EndTime": "0001-01-01T00:00:00",
  72. "BeginTime": random_time(),
  73. "Duration": "00:00:00",
  74. "MessagesActualLength": 0,
  75. "WarningsActualLength": 0,
  76. "ErrorsActualLength": 0,
  77. "Messages": None,
  78. "Warnings": None,
  79. "Errors": None
  80. }
  81. }
  82. return log
  83. def generate_bad_log():
  84. errors = ["Error: Disk full", "Error: Permission denied", "Error: Network timeout"]
  85. warnings = ["Warning: Slow upload", "Warning: File skipped", "Warning: Retry exceeded"]
  86. log = {
  87. "DeletedFiles": random.randint(100, 10000), # excessive deletions
  88. "DeletedFolders": random.randint(10, 100),
  89. "ModifiedFiles": random.randint(0, 10),
  90. "ExaminedFiles": random.randint(500, 1000),
  91. "OpenedFiles": random.randint(0, 10),
  92. "AddedFiles": random.randint(0, 2),
  93. "SizeOfModifiedFiles": random.randint(1000000, 200000000),
  94. "SizeOfAddedFiles": random.randint(0, 1000000),
  95. "SizeOfExaminedFiles": random.randint(1000000000, 8000000000),
  96. "SizeOfOpenedFiles": random.randint(0, 200000000),
  97. "NotProcessedFiles": random.randint(1, 10),
  98. "AddedFolders": random.randint(0, 2),
  99. "TooLargeFiles": random.randint(0, 2),
  100. "FilesWithError": random.randint(1, 5),
  101. "ModifiedFolders": random.randint(0, 2),
  102. "ModifiedSymlinks": 0,
  103. "AddedSymlinks": 0,
  104. "DeletedSymlinks": 0,
  105. "PartialBackup": True,
  106. "Dryrun": False,
  107. "MainOperation": "Backup",
  108. "ParsedResult": "Error",
  109. "Interrupted": True,
  110. "Version": "2.1.0.4 (2.1.0.4_stable_2025-01-31)",
  111. "EndTime": random_time(),
  112. "BeginTime": random_time(),
  113. "Duration": "00:00:{:02d}.{:07d}".format(random.randint(10, 59), random.randint(0, 9999999)),
  114. "MessagesActualLength": random.randint(30, 100),
  115. "WarningsActualLength": random.randint(5, 20),
  116. "ErrorsActualLength": random.randint(1, 5),
  117. "Messages": [random.choice(warnings) for _ in range(random.randint(5, 20))],
  118. "Warnings": [random.choice(warnings) for _ in range(random.randint(5, 20))],
  119. "Errors": [random.choice(errors) for _ in range(random.randint(1, 5))],
  120. "BackendStatistics": {
  121. "RemoteCalls": random.randint(5, 15),
  122. "BytesUploaded": random.randint(100000, 1000000),
  123. "BytesDownloaded": random.randint(100000, 1000000),
  124. "FilesUploaded": random.randint(0, 2),
  125. "FilesDownloaded": random.randint(0, 2),
  126. "FilesDeleted": random.randint(100, 10000),
  127. "FoldersCreated": random.randint(0, 2),
  128. "RetryAttempts": random.randint(1, 10),
  129. "UnknownFileSize": random.randint(0, 1),
  130. "UnknownFileCount": random.randint(0, 1),
  131. "KnownFileCount": random.randint(100, 1000),
  132. "KnownFileSize": random.randint(1000000000, 8000000000),
  133. "LastBackupDate": random_time(),
  134. "BackupListCount": random.randint(5, 20),
  135. "TotalQuotaSpace": 1932937191424,
  136. "FreeQuotaSpace": random.randint(400000000000, 800000000000),
  137. "AssignedQuotaSpace": -1,
  138. "ReportedQuotaError": random.choice([True, False]),
  139. "ReportedQuotaWarning": random.choice([True, False]),
  140. "MainOperation": "Backup",
  141. "ParsedResult": "Error",
  142. "Interrupted": True,
  143. "Version": "2.1.0.4 (2.1.0.4_stable_2025-01-31)",
  144. "EndTime": "0001-01-01T00:00:00",
  145. "BeginTime": random_time(),
  146. "Duration": "00:00:00",
  147. "MessagesActualLength": random.randint(5, 20),
  148. "WarningsActualLength": random.randint(5, 20),
  149. "ErrorsActualLength": random.randint(1, 5),
  150. "Messages": None,
  151. "Warnings": None,
  152. "Errors": None
  153. }
  154. }
  155. return log
  156. def flatten_features(log):
  157. """Extracts relevant features from top-level and BackendStatistics."""
  158. stats = log.get("BackendStatistics", {})
  159. return {
  160. "KnownFileCount": stats.get("KnownFileCount", 0),
  161. "KnownFileSize": stats.get("KnownFileSize", 0),
  162. "FilesUploaded": stats.get("FilesUploaded", 0),
  163. "FilesDownloaded": stats.get("FilesDownloaded", 0),
  164. "FilesDeleted": stats.get("FilesDeleted", 0),
  165. "BytesUploaded": stats.get("BytesUploaded", 0),
  166. "BytesDownloaded": stats.get("BytesDownloaded", 0),
  167. "Duration_sec": random.uniform(30, 600), # You can parse Duration if needed
  168. "ErrorsActualLength": log.get("ErrorsActualLength", 0),
  169. "WarningsActualLength": log.get("WarningsActualLength", 0),
  170. "label": 1 if log.get("ParsedResult") == "Error" or log.get("Interrupted") else 0
  171. }
  172. def generate_sample_data(n_good=100, n_bad=10, random_state=42):
  173. np.random.seed(random_state)
  174. random.seed(random_state)
  175. good_logs = [generate_good_log() for _ in range(n_good)]
  176. bad_logs = [generate_bad_log() for _ in range(n_bad)]
  177. all_logs = good_logs + bad_logs
  178. features = [flatten_features(log) for log in all_logs]
  179. df = pd.DataFrame(features)
  180. return df
  181. def try_isolation_forest(df):
  182. features = [
  183. "KnownFileCount", "KnownFileSize", "FilesUploaded", "FilesDownloaded",
  184. "FilesDeleted", "BytesUploaded", "BytesDownloaded",
  185. "Duration_sec", "ErrorsActualLength", "WarningsActualLength"
  186. ]
  187. clf = IsolationForest(random_state=42)
  188. preds = clf.fit_predict(df[features])
  189. df["predicted_anomaly"] = (preds == -1).astype(int)
  190. print(df[features + ["label", "predicted_anomaly"]])
  191. if __name__ == "__main__":
  192. df = generate_sample_data()
  193. try_isolation_forest(df)