[Python] jsm使って株式市場のデータをスクレイピングしSQLに保存する

Python jsmを使って日本の株式市場をスクレイピング

日本の株式データをとるには、公開APIが少なく、あっても有料なものが多いので、Yahoo!Financeからスクレイピングするケースが多いらしい。

株式データを2004/1/1から今日まで取得してSQLに格納するコードを書いた。databaseファイルは82MBになった。


#!/usr/local/bin/python3
# -*- coding: utf-8 -*-

import datetime
import jsm
import sys
import sqlite3
import pickle
import os
import urllib

class Stocks():
    def __init__(self):
        self._conn = sqlite3.connect('stocks.db')
        self._cur = self._conn.cursor()
           
    def get_ccodes(self):
        '''
        return a ccodes list
        '''
        try:
            self._cur.execute('SELECT ccode FROM brand_data')
            ccodes = self._cur.fetchall()
            if len(ccodes) == 0:
                print('The table brand_data exists but has no data.')
                self.refresh_brand()
                self._cur.execute('SELECT ccode FROM brand_data') 
                ccodes = self._cur.fetchall()
        except:
            print('The table brand_data does not exist.')
            self.refresh_brand()
            self._cur.execute('SELECT ccode FROM brand_data')
            ccodes = self._cur.fetchall()
        self.ccodes = [i[0] for i in ccodes]
        return self.ccodes

    def refresh_brand(self):
        """
        update sql brand_data and set self.list_of_dict_brand
        update ccodes list
        """
        print('refreshing brand data.')
        sql = '''CREATE TABLE IF NOT EXISTS brand_data(
        ccode int, industry_code int, industry_name text, market 

text, name text, info text, datetime text, PRIMARY KEY(ccode)
        )'''

        self._cur.execute(sql)
        b = jsm.Brand()
        q = jsm.Quotes()
        IDS = b.IDS
        list_of_tuple_brand = []
        print('IDS',IDS)
        for industry_code in IDS.keys():
            try:
                industry_name = IDS[industry_code]
                brand_data = q.get_brand(industry_code)
                [list_of_tuple_brand.append((brand.ccode,
                                industry_code,
                                industry_name,
                                brand.market,
                                brand.name,
                                brand.info,
                                datetime.datetime.now()
                                )) for brand in brand_data]
                print('Industry Code: {0}'.format(industry_code))
            except:
                print('ERROR in brand data: {0}'.format

(industry_code))
        with open('brand.pickle', 'wb') as f:
            pickle.dump(list_of_tuple_brand, f)
        # Replace brand sql
        sql = '''REPLACE INTO brand_data(ccode, industry_code, 

industry_name, market, name, info, datetime) VALUES 

(?,?,?,?,?,?,?)'''
        self._cur.executemany(sql,list_of_tuple_brand)
    
    def refresh_finance(self):
        '''refresh finance information in finance_data'''
        print('refreshing finance data.')
        q = jsm.Quotes()
        sql = '''CREATE TABLE IF NOT EXISTS finance_data(
        ccode int,
        market_cap real,
        shares_issued real,
        dividend_yield real,
        dividend_one real,
        per real,
        pbr real,
        eps real,
        bps real,
        price_min real,
        round_lot real,
        years_high int,
        years_low int,
        datetime text,
        PRIMARY KEY(ccode)
        )'''
        self._cur.execute(sql)
        ccodes = self.get_ccodes()
        list_of_tuple_finance = []
        for ccode in ccodes:
            try:
                finance_data = q.get_finance(ccode)
                list_of_tuple_finance.append((ccode,
                                             finance_data.market_cap,
                                             

finance_data.shares_issued,
                                             

finance_data.dividend_yield,
                                             

finance_data.dividend_one,
                                             finance_data.per,
                                             finance_data.pbr,
                                             finance_data.eps,
                                             finance_data.bps,
                                             finance_data.price_min,
                                             finance_data.round_lot,
                                             finance_data.years_high,
                                             finance_data.years_low,
                                             datetime.datetime.now

()))
                print('ccode: {0}'.format(ccode))
            except:
                print("Error in Financial Data: ", ccode)
        sql = '''REPLACE INTO finance_data
        (ccode, market_cap, shares_issued, dividend_yield, 

dividend_one, per, pbr, eps, bps, price_min, round_lot, years_high, 

years_low, datetime)
        VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?)'''
        self._cur.executemany(sql,list_of_tuple_finance)
        
    def refresh_stocks(self, ccode):
        """
        refreshing stocks
        """
        sql = '''CREATE TABLE IF NOT EXISTS stocks{0}
        (date text, volume real, open real, high real, low real, 

close real, PRIMARY KEY(date))'''.format(ccode)
        self._cur.execute(sql)
        q = jsm.Quotes()
        try:
            with open('stocks_ts.pickle','rb') as f:
                stocks_ts = pickle.load(f)
        except:
            stocks_ts = {}

        # set periods to get stocks
        if ccode in stocks_ts.keys():
            start_date = stocks_ts[ccode] + datetime.timedelta

(days=1)
        else:
            start_date = datetime.date(2014,1,1)
        end_date = datetime.date.today()
        if start_date<=end_date:

            try:
                quote_data = q.get_historical_prices

(ccode,jsm.DAILY,start_date=start_date,end_date=end_date)
                sql = '''REPLACE INTO stocks{0}
                (date, volume, open, high, low, close) VALUES (?, ?, 

?, ?, ?, ?)'''.format(ccode)
                values = 

[(q.date,q.volume,q.open,q.high,q.low,q.close) for q in quote_data]
                self._cur.executemany(sql, values)
                # update timestamp
                stocks_ts[ccode] = datetime.datetime.strptime

(self._cur.execute('SELECT date FROM stocks{0} ORDER BY date'.format

(ccode)).fetchall()[-1][0], '%Y-%m-%d %H:%M:%S').date()
                with open('stocks_ts.pickle', 'wb') as f:
                    pickle.dump(stocks_ts, f)
                print('ccode: {0} from {1} to {2}'.format(ccode, 

start_date, end_date))
            except jsm.exceptions.CCODENotFoundException as e:
                print(ccode, ': ', e)              
            except urllib.error.HTTPError as e:
                print('ccode: {0}; HTTP ERROR CODE: {1}'.format

(ccode, e.code)) 
        else:
            print('ccode: {0} is already updated.'.format(ccode))
    def commit(self):
        self._conn.commit()
        
    def close(self):
        self._cur.close()
        self._conn.close()
        
if __name__=='__main__':
    stocks = Stocks()
    stocks.get_ccodes()
    [stocks.refresh_stocks(ccode) for ccode in stocks.ccodes]
    stocks.commit()
    stocks.close()


今後、解析用のメソッドも書いていく


参考
株価をYahoo Financeから取得する[Python]

コメント

人気の投稿