| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315 |
- from sklearn.naive_bayes import BernoulliNB
- import numpy as np
- import finopt.ystockquote as yq
- import datetime
- from dateutil import rrule
- import itertools
def weather_play():
    # Reproduces the worked naive-Bayes example from:
    # http://www.analyticsvidhya.com/blog/2015/09/naive-bayes-explained/
    # Each row of x is a one-hot 'weather' observation over the attributes
    # ['sunny', 'overcast', 'rainy']; the target y is 0 (NO) / 1 (YES).
    # BernoulliNB is used because every feature is binary.

    sunny, overcast, rainy = [1, 0, 0], [0, 1, 0], [0, 0, 1]

    x = np.array([sunny] * 4 + [overcast] * 5 + [rainy] * 5)
    y = np.array([1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1])

    model = BernoulliNB()
    model.fit(x, y)

    predicted = model.predict([rainy, sunny])
    print(predicted)
    print(model.predict_proba([rainy, sunny, overcast]))
    print(model.feature_count_)
def str2datetime(yyyymmdd):
    # Parse a compact 'YYYYMMDD' string (e.g. '20160324') into a
    # datetime.datetime at midnight of that day.
    # Raises ValueError on malformed input.
    #
    # strptime replaces the original manual slicing
    # (yyyymmdd[0:4] / [4:6] / [6:8]) and additionally validates the
    # string instead of silently accepting garbage between the digits.
    return datetime.datetime.strptime(yyyymmdd, '%Y%m%d')
def ystr2datetime(yyyymmdd):
    # Parse a Yahoo-style 'YYYY-MM-DD' string (e.g. '2016-03-24') into a
    # datetime.datetime at midnight of that day.
    # Raises ValueError on malformed input.
    #
    # strptime replaces the original manual slicing
    # (yyyymmdd[0:4] / [5:7] / [8:10]) and additionally validates the
    # separators and digits.
    return datetime.datetime.strptime(yyyymmdd, '%Y-%m-%d')
def datetime2ystr(dt):
    # Format a datetime as the Yahoo-style 'YYYY-MM-DD' string
    # (inverse of ystr2datetime).
    return dt.strftime('%Y-%m-%d')
def ewh_hsi(rs):
    # Exploratory precursor to predict(): does the DJI's close direction
    # predict whether the HSI opens higher on its next trading day?
    # rs: business-day rruleset (see set_biz_calendar); rs.after(d) is the
    # next trading day strictly after d.

    def daily_change(code, frdate, todate, base, numerator):
        # Fetch quotes for `code` and emit one tuple per day:
        # (date, 1-if-up-else-0, numerator price, base price).
        # `base`/`numerator` index into a quote row (see idx below) and pick
        # which two prices the day-over-day change is measured between.
        e0 = yq.get_historical_prices(code, frdate, todate)
        print e0
        # e0[0] is presumably a header row, so e1[i] is the trading day
        # immediately before e2[i] -- TODO confirm against ystockquote output.
        e1 = e0[1:]
        e2 = e0[2:]

        e3 = map(lambda i: (e2[i][0],
                            1 if (float(e2[i][numerator]) - float(e1[i][base])) / float(e1[i][base]) > 0 else 0,
                            e2[i][numerator], e1[i][base]
                            ),
                 [i for i in range(len(e2))])
        return e3

    idx = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Clos']
    # Feature series: DJI adjusted-close vs previous adjusted-close.
    EWH = daily_change('^DJI', '20150901', '20160330', idx.index('Adj Clos'), idx.index('Adj Clos'))
    #EWH = EWH[:20]
    # 1 if opens high and 0 otherwise
    # Label series: HSI open vs previous adjusted-close.
    HSI = daily_change('^HSI', '20150901', '20160330', idx.index('Open'), idx.index('Adj Clos'))
    #HSI = HSI[:20]
    print len(EWH), ''.join('%s,' % x[0] for x in EWH)
    print len(HSI), ''.join('%s,' % x[0] for x in HSI)
    HSI_dates = map(lambda x: x[0], HSI)
    # filter EWH entries for which a record has a corresponding next trade record in HSI
    # example, EWH trade date 2016-02-29 the corresponding record for HSI is 2016-03-01
    EWH_filtered = filter(lambda x: datetime2ystr(rs.after(ystr2datetime(x[0]))) in HSI_dates,EWH)
    print len(EWH_filtered), EWH_filtered
    # Pair each HSI up/down label with the matching previous-day EWH value.
    hsi_ewh = map(lambda x:(HSI[HSI_dates.index(
                        datetime2ystr(rs.after(ystr2datetime(x[0]))))
                        ][1], x[1]), EWH_filtered)

    # NOTE(review): the second feature column is always 0, so it carries no
    # information -- presumably a placeholder for a second predictor; confirm.
    xx = np.array(map(lambda x: [x[1], 0], hsi_ewh))
    yy = np.array(map(lambda x: x[0], hsi_ewh))

    model = BernoulliNB()
    model.fit(xx,yy)
    predicted = model.predict([[0,0], [1,0]])
    print predicted
    print model.predict_proba([[0,0], [1,0]])
    print model.feature_count_
-
-
def cartesian_product(a, b):
    # Return every [x, y] pair with x drawn from a and y from b, in the
    # order produced by iterating a in the outer loop and b in the inner.
    pairs = []
    for left in a:
        for right in b:
            pairs.append([left, right])
    return pairs
-
def permutations(size):
    # Enumerate every binary tuple of length `size`, in ascending binary
    # order (first element most significant), e.g. size=2 gives
    # (0,0), (0,1), (1,0), (1,1).
    # Note: despite the name these are the 2**size elements of the product
    # {0,1}^size, not permutations.
    return [tuple((n >> (size - 1 - pos)) & 1 for pos in range(size))
            for n in range(2 ** size)]
def predict(rs):
    # Train a Bernoulli naive-Bayes model answering: how likely is the
    # "control" stock (cstock, e.g. ^HSI) to open UP given that each
    # "feature" stock (fstocks, e.g. ^DJI) closed UP on the previous
    # trading day?  rs is the business-day rruleset from set_biz_calendar();
    # rs.after(d) yields the next trading day strictly after d.

    def daily_change(code, frdate, todate, base, numerator):
        # compute the next day price change % and return a new binary series where
        # 1 - means UP
        # 0 - means DOWN
        # normally this is calculated as (price of today - price of yesterday) / price of yesterday
        # price type can be specified using the 'base' and 'numerator' parameters
        # Each returned tuple: (date, up-flag, numerator price, base price, pct change).

        e0 = yq.get_historical_prices(code, frdate, todate)
        print e0
        # e0[0] is presumably a header row, so e1[i] is the trading day
        # immediately before e2[i] -- TODO confirm against ystockquote output.
        e1 = e0[1:]
        e2 = e0[2:]

        e3 = map(lambda i: (e2[i][0],
                            1 if (float(e2[i][numerator]) - float(e1[i][base])) / float(e1[i][base]) > 0 else 0,
                            e2[i][numerator], e1[i][base],
                            (float(e2[i][numerator]) - float(e1[i][base])) / float(e1[i][base])
                            ),
                 [i for i in range(len(e2))])
        return e3

    def save_lf_series(name, series):
        # Persist a (label, feature) series to ../dat/<name>-<timestamp> as
        # comma-separated "label feature" pairs.
        # NOTE(review): plain open/close -- a `with` block would be safer if
        # f.write raised.
        now = datetime.datetime.now().strftime('%Y%m%d%H%M')
        f = open('%s/%s-%s' % ('../dat', name, now), 'w')
        f.write(''.join('%s %s,' % (x[0], x[1]) for x in series))
        f.close()

    def lbl_predictor_parse(c_stock, f_stock, frdate, todate):
        # Build the aligned (label, feature) series for one feature stock:
        # feature = f_stock's close-to-close direction on day T,
        # label   = c_stock's open-vs-previous-close direction on c_stock's
        #           next trading day after T.

        idx = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Clos']
        feature = daily_change(f_stock, frdate, todate, idx.index('Adj Clos'), idx.index('Adj Clos'))

        label = daily_change(c_stock, frdate, todate, idx.index('Open'), idx.index('Adj Clos'))
        #HSI = HSI[:20]
        print 'F: [%s] Num elements: %d ' % (f_stock, len(feature)), ''.join('(%s,%d,%0.4f), ' % (x[0],x[1],x[4]) for x in feature)
        print 'L: [%s] Num elements: %d ' % (c_stock, len(label)), ''.join('(%s,%d,%0.4f), ' % (x[0],x[1],x[4]) for x in label)

        # extract all the label dates
        label_trade_dates = map(lambda x: x[0], label)
        # filter feature series -
        # example, for a record with trade date (T) 2016-02-29, expect to find a label record with date = T+1
        # if a match in the label series couldn't be found, drop the feature record
        #
        # logic:
        # for each record in feature
        # determine the next business date of "label" given the feature record's date
        # if found, retain, else, drop
        feature_filtered = filter(lambda x: datetime2ystr(rs.after(ystr2datetime(x[0]))) in label_trade_dates,feature)
        print 'Filtered F:[%s] Num elements: %d ' % (f_stock, len(feature_filtered)), feature_filtered
        #
        # generate a labeledPoint (label, feature)
        label_feature = map(lambda x:(label[label_trade_dates.index(
                                datetime2ystr(rs.after(ystr2datetime(x[0]))))
                                ][1], x[1]), feature_filtered)
        print 'Matched Series [%s:%s] %s' % (c_stock, f_stock, ''.join('(%s,%s),' % (x[0], x[1]) for x in label_feature))

        save_lf_series('%s_%s' % (c_stock,f_stock), label_feature)

        return label_feature

    #features_config = {'cstock': '^HSI', 'fstocks': ['^DJI', '^FCHI', '^FVX', '^FTSE','VNQ','QQQ','GOOG','BAC'], 'date_range': ['20150901', '20160330']}
    features_config = {'cstock': '^HSI', 'fstocks': ['^DJI', 'EUR=X', 'JPY=X'], 'date_range': ['20150901', '20160330']}
    # One aligned (label, feature) series per feature stock.
    lf = []
    for fs in features_config['fstocks']:
        lf.append(lbl_predictor_parse(features_config['cstock'], fs, features_config['date_range'][0], features_config['date_range'][1]))

    # lf1 = lbl_predictor_parse('^HSI', '^DJI', '20150901', '20160325')
    # lf2 = lbl_predictor_parse('^HSI', '^FTSE', '20150901', '20160325')
    # lf3 = lbl_predictor_parse('^HSI', '^HSCE', '20150901', '20160325')
    # xx1 = np.array(map(lambda x: [x[1], 0, 0], lf1))
    # xx2 = np.array(map(lambda x: [0 , x[1] ,0], lf2))
    # xx3 = np.array(map(lambda x: [0 , 0, x[1]], lf3))
    # xx = np.concatenate((xx1, xx2, xx3))
    # #print xx
    # # yy = np.array(map(lambda x: x[0], lf1+lf2+lf3))
    # model = BernoulliNB()
    # model.fit(xx,yy)
    # scenarios = [[0,0,0], [1,1,1],[0,0,1],[0,1,1],[1,0,0],[1,1,0]]
    # predicted = model.predict(scenarios)
    # print predicted
    # print model.predict_proba(scenarios)
    # print model.feature_count_
    # build vector
    #[DJI, FTSE, HSCE]
    # Expand each per-stock series into rows where feature i occupies
    # column i of an otherwise-zero vector of width len(lf).
    points_sp = []
    points_sk = []
    for i in range(len(lf)):

        def spark_friendly(v):
            # init a bunch of zeros [0,0,...]
            point = [0] * len(lf)
            # set the value at column i of the vector
            point[i] = v[1]
            #print 'spark label:%s feature#:%d' % (v[0], i), point
            # return a tuple of label, feature
            return (v[0], point)

        def sklearn_friendly(v):
            # Same row layout as spark_friendly but without the label.
            point = [0] * len(lf)
            point[i] = v[1]
            #print 'sklearn label:%s feature#:%d' % (v[0], i), point
            return point
        # NOTE: both closures capture the loop variable i late-bound, but
        # they are consumed by map() within the same iteration, so each row
        # uses the intended column.
        #print 'len: ' , len(lf[i])
        points_sp.append(map(spark_friendly , lf[i]))
        points_sk.append(np.array(map(sklearn_friendly, lf[i])))

    #
    # format [[(1, [1, 0, 0]), (1, [1, 0, 0])], [(0, [0, 0, 0]),...]]
    def save_labelled_points(name, pt):
        # Dump points_sp to ../dat/<name>- as "label,f1 f2 ..." lines.
        # NOTE(review): the `pt` argument is ignored -- the closure reads
        # points_sp directly; and `now` is immediately blanked below, so
        # the timestamp never reaches the filename. Confirm both are
        # intentional.
        now = datetime.datetime.now().strftime('%Y%m%d%H%M')
        now = ''
        f = open('%s/%s-%s' % ('../dat', name, now), 'w')

        for i in range(len(points_sp)):
            for j in range(len(points_sp[i])):
                print '%s,%s' % (points_sp[i][j][0], ' '.join('%d' % s for s in points_sp[i][j][1]))
                f.write('%s,%s\n' % (points_sp[i][j][0], ' '.join('%d' % s for s in points_sp[i][j][1])))

        f.close()

    print "For pyspark LabeledPoint format: ", points_sp
    save_labelled_points('%s-%s' % (features_config['cstock'], '_'.join(s for s in features_config['fstocks'])), points_sp)

    # Stack the per-stock row blocks into one training matrix.
    points_sk = np.concatenate((points_sk))
    print "For sklearn numpy format:\n ", points_sk
    #print np.concatenate((points))

    #print len(lf[0]+lf[1]+lf[2]), len(reduce(lambda x,y:x+y, lf)) , len(points_sp)
    # Labels, flattened in the same order the feature rows were stacked.
    yy = np.array(map(lambda x: x[0], reduce(lambda x,y:x+y, lf)))
    model = BernoulliNB()
    model.fit(points_sk,yy)
    #scenarios = [[0,0,0], [1,1,1],[0,0,1],[0,1,1],[1,0,0],[1,1,0]]
    # Score every possible up/down combination of the feature stocks.
    num_features= len(points_sk[0])
    scenarios = permutations(num_features)

    predicted = model.predict(scenarios)
    print predicted, scenarios
    predicted_proba = model.predict_proba(scenarios)
    print predicted_proba
    print model.feature_count_
    print '************** SUMMARY REPORT **************'
    print 'Likelihood (%s) GIVEN (%s)' % (features_config['cstock'], ', '.join(s for s in features_config['fstocks']))
    print 'Expected\t\tResult\t\tScneario'

    for i in range(len(predicted)):
        print '%s:\t\t %s\t\t%s' % ('UP' if predicted[i] == 1 else 'DOWN', scenarios[i], predicted_proba[i])
-
def test():
    # Demonstrates the one-hot row-building pattern used in predict():
    # for each column i, map the values [7, 8, 9] into rows of width 3
    # with the value placed at column i, then stack all rows.
    #[DJI, FTSE, HSCE]
    columns = 3
    points = []
    for i in range(columns):

        def f1(v):
            # f1 closes over the loop variable i (late binding), but it is
            # invoked immediately below, so each call sees the current i.
            point = [0] * columns
            point[i] = v
            print('%s %s' % (i, point))
            return point

        points.append(np.array([f1(value) for value in [7, 8, 9]]))

    print(points)
    print(np.concatenate(points))
-
-
def set_biz_calendar():
    # Build the Hong Kong trading calendar as a dateutil rruleset:
    # every weekday from 2015-12-01 onward, minus the HK public holidays
    # observed in the sample window. rs.after(d) then yields the next
    # trading day strictly after d.

    # hk holidays
    holiday_strings = ('20150903', '20150928', '20151225', '20151226',
                       '20150701', '20160101', '20160208', '20160209',
                       '20160210', '20160325', '20160326', '20160328',
                       '20160404', '20160502')

    weekdays = rrule.rrule(rrule.DAILY,
                           byweekday=[rrule.MO, rrule.TU, rrule.WE, rrule.TH, rrule.FR],
                           dtstart=str2datetime('20151201'))

    calendar = rrule.rruleset()
    calendar.rrule(weekdays)
    for day in holiday_strings:
        calendar.exdate(str2datetime(day))

    return calendar
-
- #print np.array(s1)
if __name__ == '__main__':
    #weather_play()
    #test()

    #
    #
    # Model:
    #
    # What is the likelihood of HSI opens high given
    # the dow jones or some other indices closed high on
    # the previous trading day?
    #
    # Build the HK business-day calendar used to align feature/label dates.
    rs = set_biz_calendar()
    # Smoke check: show the first five business days, then the next trading
    # day after 2016-03-24 (Good Friday holiday follows) in both datetime
    # and 'YYYY-MM-DD' string form.
    print ''.join('%s,\n' % rs[i] for i in range(5)), rs.after(str2datetime('20160324')),\
        datetime2ystr(rs.after(str2datetime('20160324')))

    #ewh_hsi(rs)
    predict(rs)
|