ml_nb1.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315
  1. from sklearn.naive_bayes import BernoulliNB
  2. import numpy as np
  3. import finopt.ystockquote as yq
  4. import datetime
  5. from dateutil import rrule
  6. import itertools
  7. def weather_play():
  8. # implementing the example in the blog link below
  9. # http://www.analyticsvidhya.com/blog/2015/09/naive-bayes-explained/
  10. # each vector in x represents a predictor of type 'weather' with
  11. # attributes = ['sunny', 'overcast', 'rainy']
  12. # the label / class in y are ['NO', 'YES'] or 0,1
  13. # using Bernoulli because the vectors are in binary
  14. x= np.array([[1,0,0],[1,0,0],[1,0,0],[1,0,0],
  15. [0,1,0],[0,1,0],[0,1,0],[0,1,0],[0,1,0],
  16. [0,0,1],[0,0,1],[0,0,1],[0,0,1],[0,0,1]])
  17. y = np.array([1,1,1,1,0,0,0,1,1,0,0,1,1,1])
  18. model = BernoulliNB()
  19. model.fit(x,y)
  20. predicted = model.predict([[0,0,1],[1,0,0]])
  21. print predicted
  22. print model.predict_proba([[0,0,1],[1,0,0],[0,1,0]])
  23. print model.feature_count_
  24. def str2datetime(yyyymmdd):
  25. #print '%d%d%d'% (int(yyyymmdd[6:8]), int(yyyymmdd[4:6])-1 , int(yyyymmdd[0:4]))
  26. return datetime.datetime(int(yyyymmdd[0:4]), int(yyyymmdd[4:6]), int(yyyymmdd[6:8]))
  27. def ystr2datetime(yyyymmdd):
  28. #print '%d%d%d'% (int(yyyymmdd[6:8]), int(yyyymmdd[4:6])-1 , int(yyyymmdd[0:4]))
  29. return datetime.datetime(int(yyyymmdd[0:4]), int(yyyymmdd[5:7]), int(yyyymmdd[8:10]))
  30. def datetime2ystr(dt):
  31. return '{:%Y-%m-%d}'.format(dt)
  32. def ewh_hsi(rs):
  33. def daily_change(code, frdate, todate, base, numerator):
  34. e0 = yq.get_historical_prices(code, frdate, todate)
  35. print e0
  36. e1 = e0[1:]
  37. e2 = e0[2:]
  38. e3 = map(lambda i: (e2[i][0],
  39. 1 if (float(e2[i][numerator]) - float(e1[i][base])) / float(e1[i][base]) > 0 else 0,
  40. e2[i][numerator], e1[i][base]
  41. ),
  42. [i for i in range(len(e2))])
  43. return e3
  44. idx = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Clos']
  45. EWH = daily_change('^DJI', '20150901', '20160330', idx.index('Adj Clos'), idx.index('Adj Clos'))
  46. #EWH = EWH[:20]
  47. # 1 if opens high and 0 otherwise
  48. HSI = daily_change('^HSI', '20150901', '20160330', idx.index('Open'), idx.index('Adj Clos'))
  49. #HSI = HSI[:20]
  50. print len(EWH), ''.join('%s,' % x[0] for x in EWH)
  51. print len(HSI), ''.join('%s,' % x[0] for x in HSI)
  52. HSI_dates = map(lambda x: x[0], HSI)
  53. # filter EWH entries for which a record has a corresponding next trade record in HSI
  54. # example, EWH trade date 2016-02-29 the corresponding record for HSI is 2016-03-01
  55. EWH_filtered = filter(lambda x: datetime2ystr(rs.after(ystr2datetime(x[0]))) in HSI_dates,EWH)
  56. print len(EWH_filtered), EWH_filtered
  57. hsi_ewh = map(lambda x:(HSI[HSI_dates.index(
  58. datetime2ystr(rs.after(ystr2datetime(x[0]))))
  59. ][1], x[1]), EWH_filtered)
  60. xx = np.array(map(lambda x: [x[1], 0], hsi_ewh))
  61. yy = np.array(map(lambda x: x[0], hsi_ewh))
  62. model = BernoulliNB()
  63. model.fit(xx,yy)
  64. predicted = model.predict([[0,0], [1,0]])
  65. print predicted
  66. print model.predict_proba([[0,0], [1,0]])
  67. print model.feature_count_
  68. def cartesian_product(a, b):
  69. return [[a0,b0] for a0 in a for b0 in b]
  70. def permutations(size):
  71. #http://thomas-cokelaer.info/blog/2012/11/how-do-use-itertools-in-python-to-build-permutation-or-combination/
  72. return list(itertools.product([0,1], repeat=size))
  73. def predict(rs):
  74. def daily_change(code, frdate, todate, base, numerator):
  75. # compute the next day price change % and return a new binary series where
  76. # 1 - means UP
  77. # 0 - means DOWN
  78. # normailly this is calculated as (price of today - price of yesterday) / price of yesterday
  79. # price type can be specified using the 'base' and 'numerator' parameters
  80. e0 = yq.get_historical_prices(code, frdate, todate)
  81. print e0
  82. e1 = e0[1:]
  83. e2 = e0[2:]
  84. e3 = map(lambda i: (e2[i][0],
  85. 1 if (float(e2[i][numerator]) - float(e1[i][base])) / float(e1[i][base]) > 0 else 0,
  86. e2[i][numerator], e1[i][base],
  87. (float(e2[i][numerator]) - float(e1[i][base])) / float(e1[i][base])
  88. ),
  89. [i for i in range(len(e2))])
  90. return e3
  91. def save_lf_series(name, series):
  92. now = datetime.datetime.now().strftime('%Y%m%d%H%M')
  93. f = open('%s/%s-%s' % ('../dat', name, now), 'w')
  94. f.write(''.join('%s %s,' % (x[0], x[1]) for x in series))
  95. f.close()
  96. def lbl_predictor_parse(c_stock, f_stock, frdate, todate):
  97. idx = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Clos']
  98. feature = daily_change(f_stock, frdate, todate, idx.index('Adj Clos'), idx.index('Adj Clos'))
  99. label = daily_change(c_stock, frdate, todate, idx.index('Open'), idx.index('Adj Clos'))
  100. #HSI = HSI[:20]
  101. print 'F: [%s] Num elements: %d ' % (f_stock, len(feature)), ''.join('(%s,%d,%0.4f), ' % (x[0],x[1],x[4]) for x in feature)
  102. print 'L: [%s] Num elements: %d ' % (c_stock, len(label)), ''.join('(%s,%d,%0.4f), ' % (x[0],x[1],x[4]) for x in label)
  103. # extract all the label dates
  104. label_trade_dates = map(lambda x: x[0], label)
  105. # filter feature series -
  106. # example, for a record with trade date (T) 2016-02-29, expect to find a label record with date = T+1
  107. # if a match in the lable series couldn't be found, drop the feature record
  108. #
  109. # logic:
  110. # for each record in feature
  111. # determine the next business date of "label" given the feature record's date
  112. # if found, retrain, else, drop
  113. feature_filtered = filter(lambda x: datetime2ystr(rs.after(ystr2datetime(x[0]))) in label_trade_dates,feature)
  114. print 'Filtered F:[%s] Num elements: %d ' % (f_stock, len(feature_filtered)), feature_filtered
  115. #
  116. # generate a labeledPoint (label, feature)
  117. label_feature = map(lambda x:(label[label_trade_dates.index(
  118. datetime2ystr(rs.after(ystr2datetime(x[0]))))
  119. ][1], x[1]), feature_filtered)
  120. print 'Matched Series [%s:%s] %s' % (c_stock, f_stock, ''.join('(%s,%s),' % (x[0], x[1]) for x in label_feature))
  121. save_lf_series('%s_%s' % (c_stock,f_stock), label_feature)
  122. return label_feature
  123. #features_config = {'cstock': '^HSI', 'fstocks': ['^DJI', '^FCHI', '^FVX', '^FTSE','VNQ','QQQ','GOOG','BAC'], 'date_range': ['20150901', '20160330']}
  124. features_config = {'cstock': '^HSI', 'fstocks': ['^DJI', 'EUR=X', 'JPY=X'], 'date_range': ['20150901', '20160330']}
  125. lf = []
  126. for fs in features_config['fstocks']:
  127. lf.append(lbl_predictor_parse(features_config['cstock'], fs, features_config['date_range'][0], features_config['date_range'][1]))
  128. # lf1 = lbl_predictor_parse('^HSI', '^DJI', '20150901', '20160325')
  129. # lf2 = lbl_predictor_parse('^HSI', '^FTSE', '20150901', '20160325')
  130. # lf3 = lbl_predictor_parse('^HSI', '^HSCE', '20150901', '20160325')
  131. # xx1 = np.array(map(lambda x: [x[1], 0, 0], lf1))
  132. # xx2 = np.array(map(lambda x: [0 , x[1] ,0], lf2))
  133. # xx3 = np.array(map(lambda x: [0 , 0, x[1]], lf3))
  134. # xx = np.concatenate((xx1, xx2, xx3))
  135. # #print xx
  136. # # yy = np.array(map(lambda x: x[0], lf1+lf2+lf3))
  137. # model = BernoulliNB()
  138. # model.fit(xx,yy)
  139. # scenarios = [[0,0,0], [1,1,1],[0,0,1],[0,1,1],[1,0,0],[1,1,0]]
  140. # predicted = model.predict(scenarios)
  141. # print predicted
  142. # print model.predict_proba(scenarios)
  143. # print model.feature_count_
  144. # build vector
  145. #[DJI, FTSE, HSCE]
  146. points_sp = []
  147. points_sk = []
  148. for i in range(len(lf)):
  149. def spark_friendly(v):
  150. # init a bunch of zeros [0,0,...]
  151. point = [0] * len(lf)
  152. # set the value at column i of the vector
  153. point[i] = v[1]
  154. #print 'spark label:%s feature#:%d' % (v[0], i), point
  155. # retrun a tuple of label, feature
  156. return (v[0], point)
  157. def sklearn_friendly(v):
  158. point = [0] * len(lf)
  159. point[i] = v[1]
  160. #print 'sklearn label:%s feature#:%d' % (v[0], i), point
  161. return point
  162. #print 'len: ' , len(lf[i])
  163. points_sp.append(map(spark_friendly , lf[i]))
  164. points_sk.append(np.array(map(sklearn_friendly, lf[i])))
  165. #
  166. # format [[(1, [1, 0, 0]), (1, [1, 0, 0])], [(0, [0, 0, 0]),...]]
  167. def save_labelled_points(name, pt):
  168. now = datetime.datetime.now().strftime('%Y%m%d%H%M')
  169. now = ''
  170. f = open('%s/%s-%s' % ('../dat', name, now), 'w')
  171. for i in range(len(points_sp)):
  172. for j in range(len(points_sp[i])):
  173. print '%s,%s' % (points_sp[i][j][0], ' '.join('%d' % s for s in points_sp[i][j][1]))
  174. f.write('%s,%s\n' % (points_sp[i][j][0], ' '.join('%d' % s for s in points_sp[i][j][1])))
  175. f.close()
  176. print "For pyspark LabeledPoint format: ", points_sp
  177. save_labelled_points('%s-%s' % (features_config['cstock'], '_'.join(s for s in features_config['fstocks'])), points_sp)
  178. points_sk = np.concatenate((points_sk))
  179. print "For sklearn numpy format:\n ", points_sk
  180. #print np.concatenate((points))
  181. #print len(lf[0]+lf[1]+lf[2]), len(reduce(lambda x,y:x+y, lf)) , len(points_sp)
  182. yy = np.array(map(lambda x: x[0], reduce(lambda x,y:x+y, lf)))
  183. model = BernoulliNB()
  184. model.fit(points_sk,yy)
  185. #scenarios = [[0,0,0], [1,1,1],[0,0,1],[0,1,1],[1,0,0],[1,1,0]]
  186. num_features= len(points_sk[0])
  187. scenarios = permutations(num_features)
  188. predicted = model.predict(scenarios)
  189. print predicted, scenarios
  190. predicted_proba = model.predict_proba(scenarios)
  191. print predicted_proba
  192. print model.feature_count_
  193. print '************** SUMMARY REPORT **************'
  194. print 'Likelihood (%s) GIVEN (%s)' % (features_config['cstock'], ', '.join(s for s in features_config['fstocks']))
  195. print 'Expected\t\tResult\t\tScneario'
  196. for i in range(len(predicted)):
  197. print '%s:\t\t %s\t\t%s' % ('UP' if predicted[i] == 1 else 'DOWN', scenarios[i], predicted_proba[i])
  198. def test():
  199. #[DJI, FTSE, HSCE]
  200. points = []
  201. for i in range(3):
  202. def f1(v):
  203. point = [0] * len(range(3))
  204. point[i] = v
  205. print i, point
  206. return point
  207. points.append(np.array(map(f1 , [7,8,9])))
  208. print points
  209. print np.concatenate((points))
  210. def set_biz_calendar():
  211. #hk holidays
  212. holidays = [str2datetime('20150903'),
  213. str2datetime('20150928'),
  214. str2datetime('20151225'),
  215. str2datetime('20151226'),
  216. str2datetime('20150701'),
  217. str2datetime('20160101'),
  218. str2datetime('20160208'),
  219. str2datetime('20160209'),
  220. str2datetime('20160210'),
  221. str2datetime('20160325'),
  222. str2datetime('20160326'),
  223. str2datetime('20160328'),
  224. str2datetime('20160404'),
  225. str2datetime('20160502')]
  226. r = rrule.rrule(rrule.DAILY,
  227. byweekday=[rrule.MO, rrule.TU, rrule.WE, rrule.TH, rrule.FR],
  228. dtstart = str2datetime('20151201'))
  229. rs = rrule.rruleset()
  230. rs.rrule(r)
  231. for exdate in holidays:
  232. rs.exdate(exdate)
  233. return rs
  234. #print np.array(s1)
  235. if __name__ == '__main__':
  236. #weather_play()
  237. #test()
  238. #
  239. #
  240. # Model:
  241. #
  242. # What is the likelihood of HSI opens high given
  243. # the dow jones or some other indices closed high on
  244. # the previous trading day?
  245. #
  246. rs = set_biz_calendar()
  247. print ''.join('%s,\n' % rs[i] for i in range(5)), rs.after(str2datetime('20160324')),\
  248. datetime2ystr(rs.after(str2datetime('20160324')))
  249. #ewh_hsi(rs)
  250. predict(rs)