# analyzer1.py
import itertools
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yfinance as yf
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

  14. warnings.filterwarnings('ignore')
  15. class CryptoAnalyzer:
  16. def __init__(self):
  17. # Top traded cryptocurrencies (using their Yahoo Finance tickers)
  18. self.crypto_tickers = [
  19. 'BTC-USD', 'ETH-USD', 'BNB-USD', 'XRP-USD', 'ADA-USD',
  20. 'DOGE-USD', 'SOL-USD', 'DOT-USD', 'MATIC-USD', 'LTC-USD',
  21. 'SHIB-USD', 'TRX-USD', 'AVAX-USD', 'UNI-USD', 'LINK-USD'
  22. ]
  23. self.data = {}
  24. self.scaler = StandardScaler()
  25. self.pca = PCA(n_components=2)
  26. self.kmeans = None
  27. self.isolation_forest = None
  28. self.one_class_svm = None
  29. def download_data(self, period='1y', interval='1d'):
  30. """
  31. Download historical price data for cryptocurrencies
  32. """
  33. print("Downloading cryptocurrency data...")
  34. for ticker in self.crypto_tickers:
  35. try:
  36. crypto_data = yf.download(ticker, period=period, interval=interval)
  37. if not crypto_data.empty:
  38. self.data[ticker] = crypto_data
  39. print(f"✓ Downloaded data for {ticker}")
  40. else:
  41. print(f"✗ No data for {ticker}")
  42. except Exception as e:
  43. print(f"✗ Error downloading {ticker}: {str(e)}")
  44. print(f"Successfully downloaded data for {len(self.data)} cryptocurrencies\n")
  45. return self.data
  46. def preprocess_data(self):
  47. """
  48. Preprocess the downloaded data for analysis
  49. """
  50. print("Preprocessing data...")
  51. # Create features for each cryptocurrency
  52. features_list = []
  53. returns_list = []
  54. for ticker, df in self.data.items():
  55. # Ensure we're working with single columns by selecting specific column names
  56. if isinstance(df['Close'], pd.DataFrame):
  57. close_prices = df['Close'].iloc[:, 0] # Take first column if multiple
  58. else:
  59. close_prices = df['Close']
  60. if isinstance(df['High'], pd.DataFrame):
  61. high_prices = df['High'].iloc[:, 0] # Take first column if multiple
  62. else:
  63. high_prices = df['High']
  64. if isinstance(df['Low'], pd.DataFrame):
  65. low_prices = df['Low'].iloc[:, 0] # Take first column if multiple
  66. else:
  67. low_prices = df['Low']
  68. if isinstance(df['Volume'], pd.DataFrame):
  69. volume_data = df['Volume'].iloc[:, 0] # Take first column if multiple
  70. else:
  71. volume_data = df['Volume']
  72. # Calculate returns and technical indicators
  73. df['Returns'] = close_prices.pct_change()
  74. df['Volatility'] = df['Returns'].rolling(window=30).std()
  75. df['MA_7'] = close_prices.rolling(window=7).mean()
  76. df['MA_30'] = close_prices.rolling(window=30).mean()
  77. df['Price_MA_Ratio'] = close_prices / df['MA_30']
  78. df['Volume_MA'] = volume_data.rolling(window=30).mean()
  79. df['Volume_Ratio'] = volume_data / df['Volume_MA']
  80. # Get the latest data point for each cryptocurrency
  81. latest_data = df.iloc[-1]
  82. features = {
  83. 'Ticker': ticker,
  84. 'Close_Price': close_prices.iloc[-1],
  85. 'Returns': latest_data['Returns'],
  86. 'Volatility': latest_data['Volatility'],
  87. 'Price_MA_Ratio': latest_data['Price_MA_Ratio'],
  88. 'Volume_Ratio': latest_data['Volume_Ratio'],
  89. 'High_Low_Ratio': high_prices.iloc[-1] / low_prices.iloc[-1]
  90. }
  91. features_list.append(features)
  92. # Store returns for similarity analysis
  93. returns_data = df['Returns'].dropna()
  94. returns_list.append({
  95. 'ticker': ticker,
  96. 'returns': returns_data,
  97. 'mean_return': returns_data.mean(),
  98. 'std_return': returns_data.std()
  99. })
  100. self.features_df = pd.DataFrame(features_list)
  101. self.returns_data = returns_list
  102. # Prepare numerical features for clustering and anomaly detection
  103. self.numerical_features = ['Close_Price', 'Returns', 'Volatility',
  104. 'Price_MA_Ratio', 'Volume_Ratio', 'High_Low_Ratio']
  105. # Remove rows with NaN values
  106. self.features_df = self.features_df.dropna()
  107. print("Data preprocessing completed\n")
  108. return self.features_df
  109. def similarity_analysis(self):
  110. """
  111. Perform similarity analysis between cryptocurrencies
  112. """
  113. print("Performing similarity analysis...")
  114. # Calculate correlation matrix based on returns
  115. returns_df = pd.DataFrame()
  116. for item in self.returns_data:
  117. returns_df[item['ticker']] = item['returns'].tail(252) # Last year of data
  118. # Handle missing values by forward filling
  119. returns_df = returns_df.fillna(method='ffill').fillna(method='bfill')
  120. # Calculate correlation matrix
  121. self.correlation_matrix = returns_df.corr()
  122. # Display top similar pairs
  123. print("Top similar cryptocurrency pairs (based on correlation):")
  124. similar_pairs = []
  125. for i in range(len(self.correlation_matrix.columns)):
  126. for j in range(i+1, len(self.correlation_matrix.columns)):
  127. ticker1 = self.correlation_matrix.columns[i]
  128. ticker2 = self.correlation_matrix.columns[j]
  129. correlation = self.correlation_matrix.iloc[i, j]
  130. similar_pairs.append((ticker1, ticker2, correlation))
  131. # Sort by correlation
  132. similar_pairs.sort(key=lambda x: x[2], reverse=True)
  133. print("Top 10 most similar pairs:")
  134. for i, (t1, t2, corr) in enumerate(similar_pairs[:10]):
  135. print(f"{i+1}. {t1} - {t2}: {corr:.3f}")
  136. # Visualize correlation matrix
  137. plt.figure(figsize=(12, 10))
  138. sns.heatmap(self.correlation_matrix, annot=True, cmap='coolwarm', center=0,
  139. fmt='.2f', square=True)
  140. plt.title('Cryptocurrency Returns Correlation Matrix')
  141. plt.tight_layout()
  142. plt.show()
  143. print("\nSimilarity analysis completed\n")
  144. return self.correlation_matrix
  145. def build_anomaly_detection_models(self):
  146. """
  147. Build and train anomaly detection models
  148. """
  149. print("Building anomaly detection models...")
  150. # Prepare data for anomaly detection
  151. X = self.features_df[self.numerical_features].values
  152. X_scaled = self.scaler.fit_transform(X)
  153. # 1. K-Means Clustering for outlier detection
  154. self.kmeans = KMeans(n_clusters=3, random_state=42)
  155. cluster_labels = self.kmeans.fit_predict(X_scaled)
  156. self.features_df['Cluster'] = cluster_labels
  157. # 2. Isolation Forest
  158. self.isolation_forest = IsolationForest(contamination=0.1, random_state=42)
  159. isolation_predictions = self.isolation_forest.fit_predict(X_scaled)
  160. self.features_df['Isolation_Anomaly'] = isolation_predictions
  161. # 3. One-Class SVM
  162. self.one_class_svm = OneClassSVM(nu=0.1)
  163. svm_predictions = self.one_class_svm.fit_predict(X_scaled)
  164. self.features_df['SVM_Anomaly'] = svm_predictions
  165. # Combine anomaly detection results
  166. self.features_df['Anomaly_Score'] = (
  167. (self.features_df['Isolation_Anomaly'] == -1).astype(int) +
  168. (self.features_df['SVM_Anomaly'] == -1).astype(int)
  169. )
  170. # Anomalies are detected if 2 or more models flag them
  171. self.features_df['Is_Anomaly'] = self.features_df['Anomaly_Score'] >= 1
  172. print("Anomaly detection models built successfully")
  173. # Display anomalies found
  174. anomalies = self.features_df[self.features_df['Is_Anomaly']]
  175. if len(anomalies) > 0:
  176. print(f"\nDetected {len(anomalies)} potential anomalies:")
  177. for _, row in anomalies.iterrows():
  178. print(f" • {row['Ticker']}: Anomaly Score = {row['Anomaly_Score']}")
  179. else:
  180. print("No anomalies detected in current data")
  181. print("\nModel building completed\n")
  182. return self.features_df
  183. def detect_new_anomalies(self, new_data_point):
  184. """
  185. Detect anomalies in new incoming data
  186. """
  187. print("Detecting anomalies in new data...")
  188. # Preprocess new data point (assuming it's in the same format)
  189. new_scaled = self.scaler.transform([new_data_point])
  190. # Apply all models
  191. kmeans_pred = self.kmeans.predict(new_scaled)[0]
  192. isolation_pred = self.isolation_forest.predict(new_scaled)[0]
  193. svm_pred = self.one_class_svm.predict(new_scaled)[0]
  194. # Calculate anomaly score
  195. anomaly_score = (isolation_pred == -1) + (svm_pred == -1)
  196. is_anomaly = anomaly_score >= 1
  197. result = {
  198. 'Cluster': kmeans_pred,
  199. 'Isolation_Prediction': 'Anomaly' if isolation_pred == -1 else 'Normal',
  200. 'SVM_Prediction': 'Anomaly' if svm_pred == -1 else 'Normal',
  201. 'Anomaly_Score': anomaly_score,
  202. 'Is_Anomaly': is_anomaly
  203. }
  204. print(f"New data point analysis:")
  205. print(f" Cluster: {result['Cluster']}")
  206. print(f" Isolation Forest: {result['Isolation_Prediction']}")
  207. print(f" One-Class SVM: {result['SVM_Prediction']}")
  208. print(f" Overall: {'ANOMALY' if is_anomaly else 'NORMAL'}")
  209. return result
  210. def visualize_results(self):
  211. """
  212. Visualize the results of the analysis
  213. """
  214. print("Generating visualizations...")
  215. # 1. PCA visualization of clusters
  216. X_scaled = self.scaler.transform(self.features_df[self.numerical_features])
  217. X_pca = self.pca.fit_transform(X_scaled)
  218. plt.figure(figsize=(15, 5))
  219. # Plot 1: Clustering results
  220. plt.subplot(1, 3, 1)
  221. scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=self.features_df['Cluster'],
  222. cmap='viridis', alpha=0.7)
  223. plt.colorbar(scatter)
  224. plt.title('Cryptocurrency Clustering (PCA)')
  225. plt.xlabel('First Principal Component')
  226. plt.ylabel('Second Principal Component')
  227. # Plot 2: Anomalies vs Normal points
  228. plt.subplot(1, 3, 2)
  229. colors = ['red' if x else 'blue' for x in self.features_df['Is_Anomaly']]
  230. plt.scatter(X_pca[:, 0], X_pca[:, 1], c=colors, alpha=0.7)
  231. plt.title('Anomaly Detection Results')
  232. plt.xlabel('First Principal Component')
  233. plt.ylabel('Second Principal Component')
  234. # Plot 3: Feature distributions
  235. plt.subplot(1, 3, 3)
  236. self.features_df.boxplot(column=['Returns', 'Volatility'], ax=plt.gca())
  237. plt.title('Returns and Volatility Distribution')
  238. plt.xticks(rotation=45)
  239. plt.tight_layout()
  240. plt.show()
  241. print("Visualizations completed\n")
  242. def generate_report(self):
  243. """
  244. Generate a comprehensive report of the analysis
  245. """
  246. print("="*60)
  247. print("CRYPTOCURRENCY ANALYSIS REPORT")
  248. print("="*60)
  249. print(f"\n1. DATA SUMMARY:")
  250. print(f" • Total cryptocurrencies analyzed: {len(self.features_df)}")
  251. print(f" • Time period: 1 year")
  252. print(f" • Features analyzed: {', '.join(self.numerical_features)}")
  253. print(f"\n2. SIMILARITY ANALYSIS:")
  254. print(f" • Correlation matrix generated for all pairs")
  255. print(f" • Top similar pairs identified")
  256. print(f"\n3. ANOMALY DETECTION:")
  257. anomalies = self.features_df[self.features_df['Is_Anomaly']]
  258. print(f" • Anomalies detected: {len(anomalies)}")
  259. if len(anomalies) > 0:
  260. for _, row in anomalies.iterrows():
  261. print(f" - {row['Ticker']}")
  262. print(f"\n4. CLUSTERING:")
  263. cluster_counts = self.features_df['Cluster'].value_counts()
  264. for cluster, count in cluster_counts.items():
  265. print(f" • Cluster {cluster}: {count} cryptocurrencies")
  266. print("="*60)
  267. # Example usage
  268. def main():
  269. # Initialize the analyzer
  270. analyzer = CryptoAnalyzer()
  271. # Download data
  272. analyzer.download_data(period='1y', interval='1d')
  273. # Preprocess data
  274. analyzer.preprocess_data()
  275. # Perform similarity analysis
  276. analyzer.similarity_analysis()
  277. # Build anomaly detection models
  278. analyzer.build_anomaly_detection_models()
  279. # Visualize results
  280. analyzer.visualize_results()
  281. # Generate report
  282. analyzer.generate_report()
  283. # Example of detecting new anomalies
  284. print("Example: Detecting anomaly for new data point...")
  285. # Example new data point (features in the same order as numerical_features)
  286. example_new_point = [40000, 0.02, 0.05, 1.1, 1.5, 1.02] # BTC-like features
  287. analyzer.detect_new_anomalies(example_new_point)
  288. if __name__ == "__main__":
  289. main()