@@ -0,0 +1,315 @@
+from sklearn.naive_bayes import BernoulliNB
+import numpy as np
+import finopt.ystockquote as yq
+import datetime
+from dateutil import rrule
+import itertools
+
+def weather_play():
+
+    # implementing the example in the blog link below
+    # http://www.analyticsvidhya.com/blog/2015/09/naive-bayes-explained/
+    # each vector in x represents a predictor of type 'weather' with
+    # attributes = ['sunny', 'overcast', 'rainy']
+    # the labels / classes in y are ['NO', 'YES'], i.e. 0 or 1
+
+    # using BernoulliNB because the feature vectors are binary
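+    # each row of x is one observation, one-hot encoded over the three weather
+    # attributes (assumption: the columns follow the attribute list above, i.e.
+    # [sunny, overcast, rainy]); y[i] = 1 corresponds to 'YES'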
+
+    x = np.array([[1,0,0],[1,0,0],[1,0,0],[1,0,0],
+                  [0,1,0],[0,1,0],[0,1,0],[0,1,0],[0,1,0],
+                  [0,0,1],[0,0,1],[0,0,1],[0,0,1],[0,0,1]])
+
+    y = np.array([1,1,1,1,0,0,0,1,1,0,0,1,1,1])
+
+    model = BernoulliNB()
+    model.fit(x, y)
+    predicted = model.predict([[0,0,1],[1,0,0]])
+    print predicted
+    print model.predict_proba([[0,0,1],[1,0,0],[0,1,0]])
+    print model.feature_count_
+
+
+
+def str2datetime(yyyymmdd):
+    #print '%d%d%d'% (int(yyyymmdd[6:8]), int(yyyymmdd[4:6])-1 , int(yyyymmdd[0:4]))
+    return datetime.datetime(int(yyyymmdd[0:4]), int(yyyymmdd[4:6]), int(yyyymmdd[6:8]))
+
+
+def ystr2datetime(yyyymmdd):
+    #print '%d%d%d'% (int(yyyymmdd[6:8]), int(yyyymmdd[4:6])-1 , int(yyyymmdd[0:4]))
+    return datetime.datetime(int(yyyymmdd[0:4]), int(yyyymmdd[5:7]), int(yyyymmdd[8:10]))
+
+def datetime2ystr(dt):
+    return '{:%Y-%m-%d}'.format(dt)
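+# note: str2datetime parses 'YYYYMMDD' strings, ystr2datetime parses the
+# 'YYYY-MM-DD' date strings used in the price series, and datetime2ystr formats
+# a datetime back to 'YYYY-MM-DD', e.g. datetime2ystr(str2datetime('20160324')) == '2016-03-24'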
+
+def ewh_hsi(rs):
+
+    def daily_change(code, frdate, todate, base, numerator):
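+        # builds a binary series from the raw price rows: for each pair of
+        # adjacent rows it emits (date, 1 if the price change is positive else 0,
+        # numerator price, base price)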
+        e0 = yq.get_historical_prices(code, frdate, todate)
+        print e0
+        e1 = e0[1:]
+        e2 = e0[2:]
+
+        e3 = map(lambda i: (e2[i][0],
+                            1 if (float(e2[i][numerator]) - float(e1[i][base])) / float(e1[i][base]) > 0 else 0,
+                            e2[i][numerator], e1[i][base]
+                            ),
+                 [i for i in range(len(e2))])
+        return e3
+
+    idx = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
+    EWH = daily_change('^DJI', '20150901', '20160330', idx.index('Adj Close'), idx.index('Adj Close'))
+    #EWH = EWH[:20]
+    # 1 if it opens higher and 0 otherwise
+    HSI = daily_change('^HSI', '20150901', '20160330', idx.index('Open'), idx.index('Adj Close'))
+    #HSI = HSI[:20]
+    print len(EWH), ''.join('%s,' % x[0] for x in EWH)
+    print len(HSI), ''.join('%s,' % x[0] for x in HSI)
+    HSI_dates = map(lambda x: x[0], HSI)
+    # keep only the EWH entries that have a corresponding next-trading-day record in HSI
+    # example: for the EWH trade date 2016-02-29 the corresponding HSI record is 2016-03-01
+    EWH_filtered = filter(lambda x: datetime2ystr(rs.after(ystr2datetime(x[0]))) in HSI_dates, EWH)
+    print len(EWH_filtered), EWH_filtered
+    hsi_ewh = map(lambda x: (HSI[HSI_dates.index(
+                                 datetime2ystr(rs.after(ystr2datetime(x[0]))))
+                             ][1], x[1]), EWH_filtered)
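+    # xx pairs each EWH up/down flag with a constant 0 second column (a placeholder
+    # feature); yy holds the matched HSI up/down labels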
+
+    xx = np.array(map(lambda x: [x[1], 0], hsi_ewh))
+    yy = np.array(map(lambda x: x[0], hsi_ewh))
+
+    model = BernoulliNB()
+    model.fit(xx, yy)
+    predicted = model.predict([[0,0], [1,0]])
+    print predicted
+    print model.predict_proba([[0,0], [1,0]])
+    print model.feature_count_
+
+
+def cartesian_product(a, b):
+    return [[a0, b0] for a0 in a for b0 in b]
+
+def permutations(size):
+    # http://thomas-cokelaer.info/blog/2012/11/how-do-use-itertools-in-python-to-build-permutation-or-combination/
+    return list(itertools.product([0,1], repeat=size))
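+# e.g. permutations(2) -> [(0, 0), (0, 1), (1, 0), (1, 1)]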
+
+def predict(rs):
+
+    def daily_change(code, frdate, todate, base, numerator):
+        # compute the next-day price change % and return a new binary series where
+        # 1 - means UP
+        # 0 - means DOWN
+        # normally this is calculated as (price of today - price of yesterday) / price of yesterday
+        # the price type can be specified using the 'base' and 'numerator' parameters
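+        # each element of the returned list is a tuple:
+        # (date, up/down flag, numerator price, base price, percentage change)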
+
+        e0 = yq.get_historical_prices(code, frdate, todate)
+        print e0
+        e1 = e0[1:]
+        e2 = e0[2:]
+
+        e3 = map(lambda i: (e2[i][0],
+                            1 if (float(e2[i][numerator]) - float(e1[i][base])) / float(e1[i][base]) > 0 else 0,
+                            e2[i][numerator], e1[i][base],
+                            (float(e2[i][numerator]) - float(e1[i][base])) / float(e1[i][base])
+                            ),
+                 [i for i in range(len(e2))])
+        return e3
+
+    def save_lf_series(name, series):
+        now = datetime.datetime.now().strftime('%Y%m%d%H%M')
+        f = open('%s/%s-%s' % ('../dat', name, now), 'w')
+        f.write(''.join('%s %s,' % (x[0], x[1]) for x in series))
+        f.close()
+
+    def lbl_predictor_parse(c_stock, f_stock, frdate, todate):
+
+        idx = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
+        feature = daily_change(f_stock, frdate, todate, idx.index('Adj Close'), idx.index('Adj Close'))
+
+
+        label = daily_change(c_stock, frdate, todate, idx.index('Open'), idx.index('Adj Close'))
+        #HSI = HSI[:20]
+        print 'F: [%s] Num elements: %d ' % (f_stock, len(feature)), ''.join('(%s,%d,%0.4f), ' % (x[0], x[1], x[4]) for x in feature)
+        print 'L: [%s] Num elements: %d ' % (c_stock, len(label)), ''.join('(%s,%d,%0.4f), ' % (x[0], x[1], x[4]) for x in label)
+
+        # extract all the label dates
+        label_trade_dates = map(lambda x: x[0], label)
+        # filter the feature series -
+        # example: for a record with trade date (T) 2016-02-29, expect to find a label record with date = T+1
+        # if a match in the label series can't be found, drop the feature record
+        #
+        # logic:
+        # for each record in feature
+        #    determine the next business date of "label" given the feature record's date
+        #    if found, retain the record, else drop it
+        feature_filtered = filter(lambda x: datetime2ystr(rs.after(ystr2datetime(x[0]))) in label_trade_dates, feature)
+        print 'Filtered F:[%s] Num elements: %d ' % (f_stock, len(feature_filtered)), feature_filtered
+        #
+        # generate a labeledPoint (label, feature)
+        label_feature = map(lambda x: (label[label_trade_dates.index(
+                                           datetime2ystr(rs.after(ystr2datetime(x[0]))))
+                                       ][1], x[1]), feature_filtered)
+        print 'Matched Series [%s:%s] %s' % (c_stock, f_stock, ''.join('(%s,%s),' % (x[0], x[1]) for x in label_feature))
+
+        save_lf_series('%s_%s' % (c_stock, f_stock), label_feature)
+
+        return label_feature
+
+
+
+
+    #features_config = {'cstock': '^HSI', 'fstocks': ['^DJI', '^FCHI', '^FVX', '^FTSE','VNQ','QQQ','GOOG','BAC'], 'date_range': ['20150901', '20160330']}
+    features_config = {'cstock': '^HSI', 'fstocks': ['^DJI', 'EUR=X', 'JPY=X'], 'date_range': ['20150901', '20160330']}
+    lf = []
+    for fs in features_config['fstocks']:
+        lf.append(lbl_predictor_parse(features_config['cstock'], fs, features_config['date_range'][0], features_config['date_range'][1]))
+
+# lf1 = lbl_predictor_parse('^HSI', '^DJI', '20150901', '20160325')
+# lf2 = lbl_predictor_parse('^HSI', '^FTSE', '20150901', '20160325')
+# lf3 = lbl_predictor_parse('^HSI', '^HSCE', '20150901', '20160325')
+# xx1 = np.array(map(lambda x: [x[1], 0, 0], lf1))
+# xx2 = np.array(map(lambda x: [0 , x[1] ,0], lf2))
+# xx3 = np.array(map(lambda x: [0 , 0, x[1]], lf3))
+# xx = np.concatenate((xx1, xx2, xx3))
+# #print xx
+# # yy = np.array(map(lambda x: x[0], lf1+lf2+lf3))
+# model = BernoulliNB()
+# model.fit(xx,yy)
+# scenarios = [[0,0,0], [1,1,1],[0,0,1],[0,1,1],[1,0,0],[1,1,0]]
+# predicted = model.predict(scenarios)
+# print predicted
+# print model.predict_proba(scenarios)
+# print model.feature_count_
+
+    # build vector
+    # [DJI, FTSE, HSCE]
+    points_sp = []
+    points_sk = []
+    for i in range(len(lf)):
+
+        def spark_friendly(v):
+            # init a bunch of zeros [0,0,...]
+            point = [0] * len(lf)
+            # set the value at column i of the vector
+            point[i] = v[1]
+            #print 'spark label:%s feature#:%d' % (v[0], i), point
+            # return a tuple of (label, feature)
+            return (v[0], point)
+
+        def sklearn_friendly(v):
+            point = [0] * len(lf)
+            point[i] = v[1]
+            #print 'sklearn label:%s feature#:%d' % (v[0], i), point
+            return point
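+        # e.g. with 3 feature stocks and i == 0, a record (label, 1) for the first
+        # stock becomes the labelled point (label, [1, 0, 0]) for spark and the
+        # row [1, 0, 0] for sklearn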
+        #print 'len: ' , len(lf[i])
+        points_sp.append(map(spark_friendly, lf[i]))
+        points_sk.append(np.array(map(sklearn_friendly, lf[i])))
+
+    #
+    # format [[(1, [1, 0, 0]), (1, [1, 0, 0])], [(0, [0, 0, 0]),...]]
+    def save_labelled_points(name, pt):
+        now = datetime.datetime.now().strftime('%Y%m%d%H%M')
+        now = ''
+        f = open('%s/%s-%s' % ('../dat', name, now), 'w')
+
+        for i in range(len(points_sp)):
+            for j in range(len(points_sp[i])):
+                print '%s,%s' % (points_sp[i][j][0], ' '.join('%d' % s for s in points_sp[i][j][1]))
+                f.write('%s,%s\n' % (points_sp[i][j][0], ' '.join('%d' % s for s in points_sp[i][j][1])))
+
+        f.close()
+
+    print "For pyspark LabeledPoint format: ", points_sp
+    save_labelled_points('%s-%s' % (features_config['cstock'], '_'.join(s for s in features_config['fstocks'])), points_sp)
+
+    points_sk = np.concatenate(points_sk)
+    print "For sklearn numpy format:\n ", points_sk
+    #print np.concatenate((points))
+
+    #print len(lf[0]+lf[1]+lf[2]), len(reduce(lambda x,y:x+y, lf)) , len(points_sp)
+    yy = np.array(map(lambda x: x[0], reduce(lambda x, y: x+y, lf)))
+    model = BernoulliNB()
+    model.fit(points_sk, yy)
+    #scenarios = [[0,0,0], [1,1,1],[0,0,1],[0,1,1],[1,0,0],[1,1,0]]
+    num_features = len(points_sk[0])
+    scenarios = permutations(num_features)
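+    # scenarios enumerates every 0/1 combination of the feature stocks (one column
+    # per entry in features_config['fstocks']), so the model is queried for each
+    # possible previous-day up/down pattern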
+
+    predicted = model.predict(scenarios)
+    print predicted, scenarios
+    predicted_proba = model.predict_proba(scenarios)
+    print predicted_proba
+    print model.feature_count_
+
+    print '************** SUMMARY REPORT **************'
+    print 'Likelihood (%s) GIVEN (%s)' % (features_config['cstock'], ', '.join(s for s in features_config['fstocks']))
+    print 'Predicted\t\tScenario\t\tProbabilities'
+
+    for i in range(len(predicted)):
+        print '%s:\t\t %s\t\t%s' % ('UP' if predicted[i] == 1 else 'DOWN', scenarios[i], predicted_proba[i])
+
+def test():
+    # [DJI, FTSE, HSCE]
+    points = []
+    for i in range(3):
+
+        def f1(v):
+
+            point = [0] * len(range(3))
+            point[i] = v
+            print i, point
+            return point
+
+        points.append(np.array(map(f1, [7,8,9])))
+
+
+
+    print points
+    print np.concatenate(points)
+
+
+def set_biz_calendar():
+    # hk holidays
+    holidays = [str2datetime('20150903'),
+                str2datetime('20150928'),
+                str2datetime('20151225'),
+                str2datetime('20151226'),
+                str2datetime('20150701'),
+                str2datetime('20160101'),
+                str2datetime('20160208'),
+                str2datetime('20160209'),
+                str2datetime('20160210'),
+                str2datetime('20160325'),
+                str2datetime('20160326'),
+                str2datetime('20160328'),
+                str2datetime('20160404'),
+                str2datetime('20160502')]
+
+    r = rrule.rrule(rrule.DAILY,
+                    byweekday=[rrule.MO, rrule.TU, rrule.WE, rrule.TH, rrule.FR],
+                    dtstart=str2datetime('20151201'))
+    rs = rrule.rruleset()
+    rs.rrule(r)
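+    # excluding the holidays below makes rs.after(d) return the next business day,
+    # which is what the filters above rely on to line up feature and label dates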
+    for exdate in holidays:
+        rs.exdate(exdate)
+
+
+    return rs
+
+    #print np.array(s1)
+if __name__ == '__main__':
+    #weather_play()
+    #test()
+
+    #
+    #
+    # Model:
+    #
+    # What is the likelihood that the HSI opens higher, given that
+    # the Dow Jones or some other indices closed higher on
+    # the previous trading day?
+    #
+    rs = set_biz_calendar()
+    print ''.join('%s,\n' % rs[i] for i in range(5)), rs.after(str2datetime('20160324')),\
+        datetime2ystr(rs.after(str2datetime('20160324')))
+
+    #ewh_hsi(rs)
+    predict(rs)