# gogadmin / finopt
							from sklearn.naive_bayes import BernoulliNB
import numpy as np
import finopt.ystockquote as yq
import datetime
from dateutil import rrule
import itertools

def weather_play():
    """Fit and print a toy Bernoulli naive Bayes model on the 'weather' example.

    Reproduces the worked example from
    http://www.analyticsvidhya.com/blog/2015/09/naive-bayes-explained/

    Every sample is a binary vector over the attributes
    ['sunny', 'overcast', 'rainy']; the class label is 0 ('NO') or 1 ('YES').
    BernoulliNB is used because the feature vectors are binary.
    """
    sunny, overcast, rainy = [1, 0, 0], [0, 1, 0], [0, 0, 1]

    # 4 sunny, 5 overcast and 5 rainy observations, in that order.
    samples = np.array([sunny] * 4 + [overcast] * 5 + [rainy] * 5)
    labels = np.array([1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1])

    classifier = BernoulliNB()
    classifier.fit(samples, labels)

    # Single-argument print(...) behaves the same as the print statement.
    print(classifier.predict([rainy, sunny]))
    print(classifier.predict_proba([rainy, sunny, overcast]))
    print(classifier.feature_count_)


def str2datetime(yyyymmdd):
    """Convert a compact 'YYYYMMDD' string into a datetime at midnight.

    Only the first eight characters are read: [0:4] year, [4:6] month,
    [6:8] day.
    """
    year = int(yyyymmdd[:4])
    month = int(yyyymmdd[4:6])
    day = int(yyyymmdd[6:8])
    return datetime.datetime(year, month, day)


def ystr2datetime(yyyymmdd):
    """Convert a separated 'YYYY-MM-DD' string into a datetime at midnight.

    The separator characters at positions 4 and 7 are skipped, not checked:
    year is [0:4], month is [5:7] and day is [8:10].
    """
    year = int(yyyymmdd[:4])
    month = int(yyyymmdd[5:7])
    day = int(yyyymmdd[8:10])
    return datetime.datetime(year, month, day)

def datetime2ystr(dt):
    """Render *dt* as a 'YYYY-MM-DD' string (the inverse of ystr2datetime)."""
    return format(dt, '%Y-%m-%d')

def ewh_hsi(rs):

    def daily_change(code, frdate, todate, base, numerator):
        e0 = yq.get_historical_prices(code, frdate, todate)
        print e0
        e1 = e0[1:]
        e2 = e0[2:]
        
        e3 = map(lambda i: (e2[i][0], 
                            1 if (float(e2[i][numerator]) - float(e1[i][base])) / float(e1[i][base]) > 0 else 0,
                            e2[i][numerator], e1[i][base]
                            ), 
                            [i for i in range(len(e2))])
        return e3
    
    idx = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Clos']
    EWH = daily_change('^DJI', '20150901', '20160330', idx.index('Adj Clos'), idx.index('Adj Clos'))
    #EWH = EWH[:20]
    # 1 if opens high and 0 otherwise
    HSI = daily_change('^HSI', '20150901', '20160330', idx.index('Open'), idx.index('Adj Clos'))
    #HSI = HSI[:20]
    print len(EWH), ''.join('%s,' % x[0] for x in EWH)
    print len(HSI), ''.join('%s,' % x[0] for x in HSI)
    HSI_dates = map(lambda x: x[0], HSI)
    # filter EWH entries for which a record has a corresponding next trade record in HSI
    # example, EWH trade date 2016-02-29 the corresponding record for HSI is 2016-03-01
    EWH_filtered = filter(lambda x: datetime2ystr(rs.after(ystr2datetime(x[0]))) in HSI_dates,EWH)
    print len(EWH_filtered),  EWH_filtered
    hsi_ewh = map(lambda x:(HSI[HSI_dates.index(
                                            datetime2ystr(rs.after(ystr2datetime(x[0]))))
                                            ][1], x[1]), EWH_filtered)
    
    xx = np.array(map(lambda x: [x[1], 0], hsi_ewh))
    yy = np.array(map(lambda x: x[0], hsi_ewh))
    
    model = BernoulliNB()
    model.fit(xx,yy)
    predicted = model.predict([[0,0], [1,0]])
    print predicted
    print model.predict_proba([[0,0], [1,0]])
    print model.feature_count_    
    
    
def cartesian_product(a, b):
    """Return every [x, y] pair with x taken from *a* and y from *b*.

    Pairs are two-element lists, ordered with *a* varying slowest.
    """
    pairs = []
    for left in a:
        for right in b:
            pairs.append([left, right])
    return pairs
    
def permutations(size):
    """List every 0/1 tuple of length *size* (2 ** size tuples in total).

    Despite the name, this is the repeated Cartesian product of [0, 1], see
    http://thomas-cokelaer.info/blog/2012/11/how-do-use-itertools-in-python-to-build-permutation-or-combination/
    """
    bits = [0, 1]
    combos = itertools.product(bits, repeat=size)
    return list(combos)

def predict(rs):
    
    def daily_change(code, frdate, todate, base, numerator):
        # compute the next day price change % and return a new binary series where 
        # 1 - means UP
        # 0 - means DOWN
        # normailly this is calculated as (price of today - price of yesterday) / price of yesterday
        # price type can be specified using the 'base' and 'numerator' parameters
        
        e0 = yq.get_historical_prices(code, frdate, todate)
        print e0
        e1 = e0[1:]
        e2 = e0[2:]
        
        e3 = map(lambda i: (e2[i][0], 
                            1 if (float(e2[i][numerator]) - float(e1[i][base])) / float(e1[i][base]) > 0 else 0,
                            e2[i][numerator], e1[i][base],                            
                            (float(e2[i][numerator]) - float(e1[i][base])) / float(e1[i][base])
                            ),
                            [i for i in range(len(e2))])
        return e3
   
    def save_lf_series(name, series):
        now = datetime.datetime.now().strftime('%Y%m%d%H%M')
        f = open('%s/%s-%s' % ('../dat', name, now), 'w')
        f.write(''.join('%s %s,' % (x[0], x[1]) for x in series))
        f.close()
    
    def lbl_predictor_parse(c_stock, f_stock, frdate, todate):
    
        idx = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Clos']
        feature = daily_change(f_stock, frdate, todate, idx.index('Adj Clos'), idx.index('Adj Clos'))

    
        label = daily_change(c_stock, frdate, todate, idx.index('Open'), idx.index('Adj Clos'))
        #HSI = HSI[:20]
        print 'F: [%s] Num elements: %d ' % (f_stock, len(feature)), ''.join('(%s,%d,%0.4f), ' % (x[0],x[1],x[4]) for x in feature)
        print 'L: [%s] Num elements: %d ' % (c_stock, len(label)), ''.join('(%s,%d,%0.4f), ' % (x[0],x[1],x[4])  for x in label)
        
        # extract all the label dates
        label_trade_dates = map(lambda x: x[0], label)
        # filter feature series -  
        # example, for a record with trade date (T) 2016-02-29, expect to find a label record with date = T+1
        # if a match in the lable series couldn't be found, drop the feature record
        #
        # logic:
        # for each record in feature
        #     determine the next business date of "label" given the feature record's date
        #     if found, retrain, else, drop
        feature_filtered = filter(lambda x: datetime2ystr(rs.after(ystr2datetime(x[0]))) in label_trade_dates,feature)
        print 'Filtered F:[%s] Num elements: %d ' % (f_stock, len(feature_filtered)),  feature_filtered
        #
        # generate a labeledPoint (label, feature)
        label_feature = map(lambda x:(label[label_trade_dates.index(
                                                datetime2ystr(rs.after(ystr2datetime(x[0]))))
                                                ][1], x[1]), feature_filtered)
        print 'Matched Series [%s:%s] %s' % (c_stock, f_stock, ''.join('(%s,%s),' % (x[0], x[1]) for x in label_feature))
        
        save_lf_series('%s_%s' % (c_stock,f_stock), label_feature)
        
        return label_feature
    

    #features_config = {'cstock': '^HSI', 'fstocks': ['^DJI', '^FCHI', '^FVX', '^FTSE','VNQ','QQQ','GOOG','BAC'], 'date_range': ['20150901', '20160330']}
    features_config = {'cstock': '^HSI', 'fstocks': ['^DJI', 'EUR=X', 'JPY=X'], 'date_range': ['20150901', '20160330']}
    lf = []
    for fs in features_config['fstocks']:
        lf.append(lbl_predictor_parse(features_config['cstock'], fs, features_config['date_range'][0], features_config['date_range'][1]))
                  
#     lf1 = lbl_predictor_parse('^HSI', '^DJI', '20150901', '20160325')
#     lf2 = lbl_predictor_parse('^HSI', '^FTSE', '20150901', '20160325')
#     lf3 = lbl_predictor_parse('^HSI', '^HSCE', '20150901', '20160325')
#     xx1 = np.array(map(lambda x: [x[1], 0,    0], lf1))
#     xx2 = np.array(map(lambda x: [0   , x[1] ,0], lf2))
#     xx3 = np.array(map(lambda x: [0   , 0, x[1]], lf3))
#     xx = np.concatenate((xx1, xx2, xx3))
#     #print xx
# #     yy = np.array(map(lambda x: x[0], lf1+lf2+lf3))
#     model = BernoulliNB()
#     model.fit(xx,yy)
#     scenarios = [[0,0,0], [1,1,1],[0,0,1],[0,1,1],[1,0,0],[1,1,0]]
#     predicted = model.predict(scenarios)
#     print predicted
#     print model.predict_proba(scenarios)
#     print model.feature_count_     

    # build vector
    #[DJI, FTSE, HSCE]
    points_sp = []
    points_sk = []
    for i in range(len(lf)):
        
        def spark_friendly(v):
            # init a bunch of zeros [0,0,...]
            point = [0] * len(lf)
            # set the value at column i of the vector
            point[i] = v[1] 
            #print 'spark label:%s feature#:%d' %  (v[0], i),  point
            # retrun  a tuple of label, feature
            return (v[0], point)
        
        def sklearn_friendly(v):
            point = [0] * len(lf)
            point[i] = v[1] 
            #print 'sklearn label:%s feature#:%d' %  (v[0], i),  point
            return point
        #print 'len: ' , len(lf[i])
        points_sp.append(map(spark_friendly , lf[i]))
        points_sk.append(np.array(map(sklearn_friendly, lf[i])))
        
    #
    # format  [[(1, [1, 0, 0]), (1, [1, 0, 0])], [(0, [0, 0, 0]),...]] 
    def save_labelled_points(name, pt):
        now = datetime.datetime.now().strftime('%Y%m%d%H%M')
        now = ''
        f = open('%s/%s-%s' % ('../dat', name, now), 'w')
        
        for i in range(len(points_sp)):
            for j in range(len(points_sp[i])):
                print '%s,%s' % (points_sp[i][j][0], ' '.join('%d' % s for s in points_sp[i][j][1]))
                f.write('%s,%s\n' % (points_sp[i][j][0], ' '.join('%d' % s for s in points_sp[i][j][1])))
                
        f.close()                
        
    print "For pyspark LabeledPoint format: ", points_sp
    save_labelled_points('%s-%s' % (features_config['cstock'], '_'.join(s for s in features_config['fstocks'])), points_sp)
    
    points_sk = np.concatenate((points_sk))
    print "For sklearn numpy format:\n ", points_sk
    #print np.concatenate((points))        
  
    #print len(lf[0]+lf[1]+lf[2]), len(reduce(lambda x,y:x+y, lf))  , len(points_sp)
    yy = np.array(map(lambda x: x[0], reduce(lambda x,y:x+y, lf)))
    model = BernoulliNB()
    model.fit(points_sk,yy)
    #scenarios = [[0,0,0], [1,1,1],[0,0,1],[0,1,1],[1,0,0],[1,1,0]]
    num_features= len(points_sk[0])
    scenarios = permutations(num_features)
    
    predicted = model.predict(scenarios)
    print predicted, scenarios
    predicted_proba = model.predict_proba(scenarios)
    print predicted_proba
    print model.feature_count_   

    print '************** SUMMARY REPORT **************'
    print 'Likelihood (%s) GIVEN (%s)' % (features_config['cstock'], ', '.join(s for s in features_config['fstocks']))
    print 'Expected\t\tResult\t\tScneario'
    
    for i in range(len(predicted)):
        print '%s:\t\t %s\t\t%s' % ('UP' if predicted[i] == 1 else 'DOWN', scenarios[i], predicted_proba[i]) 
        
def test():
    #[DJI, FTSE, HSCE]
    points = []
    for i in range(3):
        
        def f1(v):
            
            point = [0] * len(range(3))
            point[i] = v
            print i, point
            return point
        
        points.append(np.array(map(f1 , [7,8,9])))
            

    print points
    print np.concatenate((points))
  
    
def set_biz_calendar(start='20150701'):
    """Build the Monday-Friday calendar, minus listed HK holidays, used for next-day lookups.

    Args:
        start: first calendar day as a 'YYYYMMDD' string. It must not be
            later than the earliest date that will be passed to
            ``rs.after``: recurrences begin at ``dtstart``, so ``after`` on
            an earlier date yields the first calendar entry rather than that
            date's own next business day. The default precedes the
            '20150901' start of the quote range used in this file.

    Returns:
        A ``dateutil.rrule.rruleset`` holding the daily weekday rule with
        every listed holiday added as an excluded date.
    """
    # hk holidays
    # NOTE(review): the list may not cover every holiday in the period --
    # verify against the official calendar before relying on it.
    holidays = [str2datetime(day) for day in (
        '20150701',
        '20150903',
        '20150928',
        '20151225',
        '20151226',
        '20160101',
        '20160208',
        '20160209',
        '20160210',
        '20160325',
        '20160326',
        '20160328',
        '20160404',
        '20160502',
    )]

    weekdays = rrule.rrule(rrule.DAILY,
                           byweekday=[rrule.MO, rrule.TU, rrule.WE, rrule.TH, rrule.FR],
                           dtstart=str2datetime(start))
    rs = rrule.rruleset()
    rs.rrule(weekdays)
    for exdate in holidays:
        rs.exdate(exdate)

    return rs
    
    #print np.array(s1)
if __name__ == '__main__':
    #weather_play()
    #test()
    
    #
    # 
    # Model: 
    # 
    # What is the likelihood of HSI opens high given
    # the dow jones or some other indices closed high on 
    # the previous trading day?
    #
    rs = set_biz_calendar()
    print ''.join('%s,\n' % rs[i] for i in range(5)), rs.after(str2datetime('20160324')),\
                                                           datetime2ystr(rs.after(str2datetime('20160324')))
     
    #ewh_hsi(rs)
    predict(rs)