In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import indicators as indi
In [2]:
# Location of the USDJPY bar data (semicolon-separated; column names are
# supplied below via `names=`).
csvpath = 'R:/WORK/DAT_ASCII_USDJPY_M1_2016.csv'
# Read the six columns (the last one is given an empty name), use 'Time' as a
# parsed datetime index, and keep only the four OHLC price columns.
df = pd.read_csv(csvpath, sep=';', names=('Time','Open','High','Low','Close', ''),
                 index_col='Time', parse_dates=True)['Open High Low Close'.split()]
# Move every timestamp forward by 7 hours.
# NOTE(review): presumably a time-zone adjustment of the source data - confirm.
df.index += pd.offsets.Hour(7)
# Preview the first rows.
df.head()
Out[2]:
Open High Low Close
Time
2016-01-04 00:00:00 120.172 120.205 120.171 120.205
2016-01-04 00:02:00 120.202 120.210 120.202 120.210
2016-01-04 00:04:00 120.212 120.212 120.211 120.211
2016-01-04 00:05:00 120.212 120.212 120.206 120.209
2016-01-04 00:06:00 120.206 120.218 120.206 120.218

Openのデータが1ティック目の価格のようなデータの場合は
Openを以下のようにしたほうがちょうど1分という意味では正しいか

In [3]:
# df['Open'] = df['Close'].shift()
# df.dropna(inplace=True)
# df.head()
In [ ]:
 

前回の記事で書いたのはこれだった

これには簡単に改善できるのにちょっと遅い部分があった
それは、シグナル計算の部分で shift() を使ったときに
先頭にnanが入って型がObject型になってしまって
calc_result()の(df['Hsig']&(df['ans']==1))のような、
boolならもっと高速に計算できる部分のせいのようだった
(たぶん おそらく (´・ω・`))

In [13]:
def calc_answer(df, draw_range=0.0):
    """Label every bar with its direction and store it in ``df['ans']``.

    A bar is labelled 1 when Close is above Open + ``draw_range``, -1 when
    Close is below Open - ``draw_range`` and 0 otherwise.  When both
    conditions hold (possible only for a negative ``draw_range``) the -1
    label wins because it is applied last.

    The frame is modified in place and also returned.
    """
    closed_up = df['Close'] > df['Open'] + draw_range
    closed_down = df['Close'] < df['Open'] - draw_range

    df['ans'] = 0
    # Apply "up" first and "down" second so that "down" takes precedence.
    df['ans'] = df['ans'].mask(closed_up, 1).mask(closed_down, -1)
    return df

def calc_rsi_signal(df, period=14, leveldiff=20):
    """Add the RSI-based entry signal columns ``Hsig`` and ``Lsig`` to ``df``.

    ``Hsig`` flags rows whose previous row had RSI below ``50 - leveldiff``;
    ``Lsig`` flags rows whose previous row had RSI above ``50 + leveldiff``.
    The one-row ``shift()`` leaves a missing value in the first row.

    NOTE(review): ``indi.iRSI`` is assumed to return a pandas object aligned
    with ``df`` - confirm against the ``indicators`` module.

    The frame is modified in place and also returned.
    """
    rsi = indi.iRSI(df, period)
    below_lower_level = rsi < 50 - leveldiff
    above_upper_level = rsi > 50 + leveldiff
    # Delay by one row so a signal seen on one bar is acted on at the next.
    df['Hsig'] = below_lower_level.shift()
    df['Lsig'] = above_upper_level.shift()
    return df

def calc_result(df):
    """Score every signalled bar and store the outcome in ``df['result']``.

    Reads ``Hsig``, ``Lsig`` and ``ans``.  ``result`` starts as ``None``
    (no entry) and is then overwritten in three ordered passes:

    1. 1  where the signal direction matches ``ans``;
    2. -1 where a signal fired but ``ans`` is not the matching direction
       (this pass also hits flat bars);
    3. 0  where a signal fired on a flat bar (``ans == 0``), undoing pass 2
       for those rows.

    The frame is modified in place and also returned.
    """
    high_sig, low_sig, answer = df['Hsig'], df['Lsig'], df['ans']

    won = (high_sig & (answer == 1)) | (low_sig & (answer == -1))
    lost = (high_sig & (answer != 1)) | (low_sig & (answer != -1))
    drew = (high_sig | low_sig) & (answer == 0)

    df['result'] = None
    # Order matters: the draw pass must run after the loss pass.
    df['result'] = df['result'].mask(won, 1).mask(lost, -1).mask(drew, 0)
    return df

def summary(df):
    """Print trade statistics computed from ``df['result']``.

    ``result`` is expected to hold 1 (win), -1 (loss), 0 (draw) or a missing
    value (no entry).  Printed lines:

    * bars  - number of rows in ``df``
    * N     - number of rows with a non-missing result
    * win   - count, share of N, share of decided trades (win + loss)
    * loss  - count, share of N, share of decided trades
    * draw  - count, share of N
    * total - win minus loss

    When a denominator is zero (no trades, or only draws) the percentage is
    reported as 0.00 instead of dividing by zero.
    """
    def percent(part, whole):
        # Guard the empty case; otherwise the arithmetic is unchanged.
        return part / whole * 100 if whole else 0.0

    outcome = df['result']
    bars = len(df)
    n = len(outcome.dropna())
    win  = (outcome == 1).sum()
    loss = (outcome == -1).sum()
    draw = (outcome == 0).sum()
    decided = win + loss

    print('bars : {}'.format(bars))
    print('N    : {}'.format(n))
    print('win  : {}({:.2f}%)({:.2f}%)'.format(win, percent(win, n), percent(win, decided)))
    print('loss : {}({:.2f}%)({:.2f}%)'.format(loss, percent(loss, n), percent(loss, decided)))
    print('draw : {}({:.2f}%)'.format(draw, percent(draw, n)))
    print('total: {}'.format(win-loss))

def plot(df, payout=1):
    """Plot the running total of trade outcomes.

    Wins (``result == 1``) are valued at ``payout``; other non-missing
    results keep their value.  Rows without a result are dropped before the
    cumulative sum is drawn on a 15x5 figure.
    """
    outcome = df['result']
    per_trade = outcome.mask(outcome == 1, payout).dropna()
    per_trade.cumsum().plot(figsize=(15,5))

# Run the original pipeline, timing each stage separately, then print the
# statistics and plot the cumulative result with a 0.8 payout per win.
%time calc_answer(df)
%time calc_rsi_signal(df)
%time calc_result(df)
summary(df)
plot(df, 0.8)
Wall time: 19 ms
Wall time: 46 ms
Wall time: 350 ms
bars : 372417
N    : 31419
win  : 16960(53.98%)(55.90%)
loss : 13382(42.59%)(44.10%)
draw : 1077(3.43%)
total: 3578

↓こいつらです(´・ω・`)

In [5]:
# Show the dtype of the two shifted signal columns.
print(df['Hsig'].dtypes)
print(df['Lsig'].dtypes)

print('↓こういうことなんですね')
# Minimal reproduction: print the dtype of a bool Series before shift(),
# after shift(), and after the shifted Series has its gap filled with False.
s = pd.Series(np.ones(3, dtype=bool))
print(s.dtypes)
print(s.shift().dtypes)
print(s.shift().fillna(False).dtypes)
object
object
↓こういうことなんですね
bool
object
bool

この部分をbool型にしてやってみたら少し速くなった

calc_result()で集計前にnanをFalseで埋めた

まえのコードだと350msくらいかかってた箇所が
80msくらいになったヾ(´・ω・`)ノ゙

それとついでにspread HighLowのルールを勘違いしていたようなので
calc_answerで答えの配列を計算する部分も直した
(とはいえルールそんなに細かく調べてない(´・ω・`))

In [6]:
def calc_answer(df, spread=0.000):
    """Label every bar separately for High and Low entries.

    ``ansH`` compares Close with Open + ``spread``: 1 when Close is above,
    -1 when below, 0 when equal.
    ``ansL`` compares Close with Open - ``spread``: 1 when Close is below,
    -1 when above, 0 when equal.

    The frame is modified in place and also returned.
    """
    close, open_ = df['Close'], df['Open']

    def labelled(win, lose):
        # Start from all-zero (draw), then stamp wins and losses.
        return pd.Series(0, index=df.index).mask(win, 1).mask(lose, -1)

    high_strike = open_ + spread
    df['ansH'] = labelled(close > high_strike, close < high_strike)

    low_strike = open_ - spread
    df['ansL'] = labelled(close < low_strike, close > low_strike)
    return df

def calc_rsi_signal(df, period=14, leveldiff=20):
    """Write the shifted RSI threshold signals into ``df``.

    ``Hsig`` is the one-row-shifted test ``RSI < 50 - leveldiff`` and
    ``Lsig`` the one-row-shifted test ``RSI > 50 + leveldiff``.  Because of
    the shift the first row of each column is a missing value.

    NOTE(review): the return type of ``indi.iRSI`` is not visible here; it is
    assumed to support comparison and ``.shift()`` - confirm.

    The frame is modified in place and also returned.
    """
    rsi = indi.iRSI(df, period)
    signals = (
        ('Hsig', rsi < 50 - leveldiff),
        ('Lsig', rsi > 50 + leveldiff),
    )
    for column, fired in signals:
        df[column] = fired.shift()
    return df

def calc_result(df):
    """Score every signalled bar and store the outcome in ``df['result']``.

    ``Hsig`` / ``Lsig`` have their missing values replaced by False and are
    then cast explicitly to ``bool``.  ``fillna`` alone does not guarantee a
    bool dtype for an object column on every pandas version, and the boolean
    algebra below is only fast on real bool columns.

    ``result`` starts as ``None`` (no entry) and becomes
    1 / -1 / 0 where a High signal coincides with ``ansH`` == 1 / -1 / 0 or a
    Low signal coincides with ``ansL`` == 1 / -1 / 0.

    The frame is modified in place and also returned.
    """
    df['Hsig'] = df['Hsig'].fillna(False).astype(bool)
    df['Lsig'] = df['Lsig'].fillna(False).astype(bool)
    high_sig, low_sig = df['Hsig'], df['Lsig']
    ans_high, ans_low = df['ansH'], df['ansL']

    df['result'] = None
    for outcome in (1, -1, 0):
        hit = (high_sig & (ans_high == outcome)) | (low_sig & (ans_low == outcome))
        df['result'] = df['result'].mask(hit, outcome)
    return df

def summary(df):
    """Print trade statistics computed from ``df['result']``.

    ``result`` is expected to hold 1 (win), -1 (loss), 0 (draw) or a missing
    value (no entry).  Printed lines:

    * bars  - number of rows in ``df``
    * N     - number of rows with a non-missing result
    * win   - count, share of N, share of decided trades (win + loss)
    * loss  - count, share of N, share of decided trades
    * draw  - count, share of N
    * total - win minus loss

    When a denominator is zero (no trades, or only draws) the percentage is
    reported as 0.00 instead of dividing by zero.
    """
    def ratio(part, whole):
        # Guard the empty case; otherwise the arithmetic is unchanged.
        if not whole:
            return 0.0
        return part / whole * 100

    outcome = df['result']
    bars = len(df)
    n = len(outcome.dropna())
    win  = (outcome == 1).sum()
    loss = (outcome == -1).sum()
    draw = (outcome == 0).sum()
    decided = win + loss

    print('bars : {}'.format(bars))
    print('N    : {}'.format(n))
    print('win  : {}({:.2f}%)({:.2f}%)'.format(win, ratio(win, n), ratio(win, decided)))
    print('loss : {}({:.2f}%)({:.2f}%)'.format(loss, ratio(loss, n), ratio(loss, decided)))
    print('draw : {}({:.2f}%)'.format(draw, ratio(draw, n)))
    print('total: {}'.format(win-loss))

def plot(df, payout=1):
    """Plot the running total of trade outcomes.

    Rows without a result are dropped and the remainder converted to float;
    wins (``result == 1``) are then valued at ``payout`` and the cumulative
    sum is drawn on a 15x5 figure.
    """
    is_win = df['result'] == 1
    per_trade = df['result'].dropna().astype(float)
    # `is_win` still carries the full index; mask() aligns it to the kept rows.
    equity = per_trade.mask(is_win, payout).cumsum()
    equity.plot(figsize=(15,5))


# Run the revised pipeline with zero spread, timing each stage separately,
# then print the statistics and plot the cumulative result (0.8 payout per win).
%time calc_answer(df, 0.000)
%time calc_rsi_signal(df)
%time calc_result(df)
summary(df)
plot(df, 0.8)
Wall time: 34 ms
Wall time: 48 ms
Wall time: 77 ms
bars : 372417
N    : 31419
win  : 16960(53.98%)(55.90%)
loss : 13382(42.59%)(44.10%)
draw : 1077(3.43%)
total: 3578

主にnumpyでやるようにしたらもう少し速くなるかと思って試したら

速くなった(´・ω・`)

シグナルもshiftせず、calc_result()で
self.Hsig = np.r_[[False], self.Hsig[:-1]]
こんなかんじで先頭にFalseをいれてHsigの[:-1]までを繋いだものを用意するようにした
(ここでself.Hsigはnumpy.ndarrayになってる)

In [7]:
class BOTester(object):
    """NumPy-based backtester for one-bar High/Low entries driven by RSI signals.

    Typical use: ``calc_answer()`` -> ``calc_rsi_signal()`` -> ``calc_result()``
    -> ``summary()`` / ``plot()``.

    Attributes set by the methods:
        ansH, ansL : float arrays holding 1 / -1 / 0 per bar (see ``calc_answer``).
        Hsig, Lsig : per-bar boolean signals, NOT shifted (see ``calc_rsi_signal``).
        result     : ``pd.Series`` of 1 / -1 / 0 for the bars that had an entry.
    """

    def __init__(self, df, spread=0.000):
        """Keep ``df`` and cache its 'Close' / 'Open' columns as arrays.

        df     : frame with at least 'Open' and 'Close' columns.
        spread : offset added to (High) / subtracted from (Low) the open price.
        """
        self.df = df
        self.spread = spread
        self.Close = self.df['Close'].values
        self.Open  = self.df['Open'].values
        # Sentinel marking bars without an entry; must differ from 1, -1 and 0.
        self.NO_ENTRY = -2

    def calc_answer(self):
        """Compute the per-bar outcome for a High entry and for a Low entry.

        ``ansH``: 1 if Close > Open + spread, -1 if Close < Open + spread, else 0.
        ``ansL``: 1 if Close < Open - spread, -1 if Close > Open - spread, else 0.
        """
        self.ansH = np.zeros(len(self.df))
        self.ansH = np.where(self.Close>self.Open+self.spread,  1, self.ansH)
        self.ansH = np.where(self.Close<self.Open+self.spread, -1, self.ansH)
        self.ansL = np.zeros(len(self.df))
        self.ansL = np.where(self.Close<self.Open-self.spread,  1, self.ansL)
        self.ansL = np.where(self.Close>self.Open-self.spread, -1, self.ansL)

    def calc_rsi_signal(self, period=14, leveldiff=20):
        """Store the unshifted RSI threshold signals in ``Hsig`` / ``Lsig``.

        ``Hsig`` is ``RSI < 50 - leveldiff`` and ``Lsig`` is ``RSI > 50 + leveldiff``.
        NOTE(review): ``indi.iRSI`` is assumed to return one value per row of
        ``self.df`` - confirm against the ``indicators`` module.
        """
        rsi = indi.iRSI(self.df, period)
        self.Hsig = (rsi<50-leveldiff)
        self.Lsig = (rsi>50+leveldiff)

    @staticmethod
    def _previous_bar(signal):
        """Return ``signal`` delayed by one bar, with False in the first slot."""
        return np.r_[[False], np.asarray(signal)[:-1]]

    def calc_result(self):
        """Score every bar whose previous bar raised a signal.

        The signals are delayed by one bar into local variables, so
        ``self.Hsig`` / ``self.Lsig`` are left untouched and calling this
        method repeatedly always yields the same ``self.result``.
        """
        hsig = self._previous_bar(self.Hsig)
        lsig = self._previous_bar(self.Lsig)
        result = np.full(len(self.df), self.NO_ENTRY, dtype=np.int64)
        for outcome in (1, -1, 0):
            hit = (hsig & (self.ansH == outcome)) | (lsig & (self.ansL == outcome))
            result = np.where(hit, outcome, result)
        result = pd.Series(result, index=self.df.index)
        # Keep only the bars that actually had an entry.
        self.result = result[result != self.NO_ENTRY]

    def summary(self):
        """Print bar count, trade count, win/loss/draw counts with rates, and win - loss.

        A percentage whose denominator is zero (no trades, or only draws) is
        printed as 0.00 instead of dividing by zero.
        """
        def percent(part, whole):
            return part / whole * 100 if whole else 0.0

        bars = len(self.df)
        n = len(self.result)
        win  = (self.result==1).sum()
        loss = (self.result==-1).sum()
        draw = (self.result==0).sum()
        decided = win + loss
        print('bars : {}'.format(bars))
        print('N    : {}'.format(n))
        print('win  : {}({:.2f}%)({:.2f}%)'.format(win, percent(win, n), percent(win, decided)))
        print('loss : {}({:.2f}%)({:.2f}%)'.format(loss, percent(loss, n), percent(loss, decided)))
        print('draw : {}({:.2f}%)'.format(draw, percent(draw, n)))
        print('total: {}'.format(win-loss))

    def plot(self, payout=1):
        """Plot the cumulative sum of results, valuing each win at ``payout``."""
        self.result.mask(self.result==1, payout).cumsum().plot(figsize=(15,5))

# Run the class-based pipeline with zero spread, timing construction and each
# stage separately, then print the statistics and plot with a 0.8 payout per win.
%time bt = BOTester(df, 0.000)
%time bt.calc_answer()
%time bt.calc_rsi_signal()
%time bt.calc_result()
bt.summary()
bt.plot(0.8)
Wall time: 1e+03 µs
Wall time: 16 ms
Wall time: 20 ms
Wall time: 9 ms
bars : 372417
N    : 31419
win  : 16960(53.98%)(55.90%)
loss : 13382(42.59%)(44.10%)
draw : 1077(3.43%)
total: 3578

全体で50msくらいで終わるようになった(printとplotを除く)ヾ(´・ω・`)ノ゙

calc_answerも少し速くなった
calc_rsi_signalはshiftしないぶん速くなった
calc_resultも速くなった(df.maskじゃなくてnp.whereにしたからか?)

np.whereとSeries.maskだと差があるかな?

In [10]:
# Case 1: np.where applied to a plain ndarray of 3,000,000 integers.
a = np.arange(3000000)
%timeit b = np.where(a%2==0, -1, a)
10 loops, best of 3: 20.9 ms per loop
In [11]:
# Case 2: the same np.where call, but with the data wrapped in a pandas Series.
a = pd.Series(np.arange(3000000))
%timeit b = np.where(a%2==0, -1, a)
10 loops, best of 3: 21.6 ms per loop
In [12]:
# Case 3: the same replacement expressed with Series.mask instead of np.where.
a = pd.Series(np.arange(3000000))
%timeit b = a.mask(a%2==0, -1)
10 loops, best of 3: 31.7 ms per loop

numpy配列でもSeriesでもnp.whereを使ったときの速度差はない感じですが
maskは少し時間がかかるっぽいですね
indexを保持しなくていいならnp.whereが速くていいか

とはいえ繰り返してやる処理でないならそんなに気にしないですけどね(´・ω・`)