"""Scrape lottery draw results from kaijiang.500.com pages and store them.

Each result page is fetched with ``requests_html`` and the parsed issue
number, draw date text and draw code are handed to ``Lottery.insert``.
"""
from requests_html import HTMLSession, HTML
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from retrying import retry

from lottery import Lottery


# One HTTP session shared by every request made from this module.
session = HTMLSession()


@retry(stop_max_attempt_number=3)
def _get_data(url, lottery_type):
    """Fetch one draw-result page and persist it.

    The whole function is retried (up to 3 attempts) on any exception.

    Args:
        url: Address of a single draw-result page.
        lottery_type: Lottery kind understood by ``Lottery`` (e.g. ``'pls'``).

    Raises:
        ValueError: If the page does not contain the expected result table.
        Exception: Whatever the HTTP request, the parsing or
            ``Lottery.insert`` raises on the last attempt.
    """
    lottery = Lottery(lottery_type=lottery_type)
    r = session.get(url)
    table_list = r.html.find("table.kj_tablelist02", first=True)
    if table_list is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"no 'table.kj_tablelist02' element found at {url}")
    draw_issue = table_list.find("td.td_title01 span.span_left strong")[0].text
    draw_date = table_list.find("td.td_title01 span.span_right")[0].text
    draw_code = table_list.find("div.ball_box01")[0].text.replace('\n', '')
    last_id = lottery.insert(draw_issue, draw_date, draw_code)
    # NOTE(review): Lottery.insert returns the row id for both new and
    # already-stored issues, so the else branch only runs for a falsy id.
    if last_id:
        print(f"issue:{draw_issue}数据写入完成。。。")
    else:
        print(f'issue:{draw_issue}已经存在')


def get_data(url, lottery_type):
    """Run ``_get_data`` and report a failure that survived all retries.

    Args:
        url: Address of a single draw-result page.
        lottery_type: Lottery kind passed through to ``_get_data``.

    Raises:
        Exception: Re-raises the error of the final attempt after printing it.
    """
    try:
        _get_data(url, lottery_type)
    except Exception as e:
        print(e)
        print('异常出错重试后,依然报错')
        # Bare raise keeps the original traceback intact.
        raise


def main(basic_url, lottery_type):
    """Crawl every result page linked from ``basic_url``.

    The links matched by ``div.kjxq_box02_title_right span div a`` are
    processed in reverse document order; a page that still fails after the
    retries is reported and skipped so the remaining pages are processed.

    Args:
        basic_url: Page whose link list enumerates the result pages.
        lottery_type: Lottery kind passed through to ``get_data``.
    """
    r = session.get(basic_url)
    select_list = list(reversed(r.html.find('div.kjxq_box02_title_right span div a')))
    for item in select_list:
        html = HTML(html=item.html)
        url = html.find('a', first=True).attrs['href']
        try:
            get_data(url, lottery_type)
        except Exception as e:
            print(e)
            continue


if __name__ == '__main__':
    url = "https://kaijiang.500.com/shtml/pls/22265.shtml"
    # url = "https://kaijiang.500.com/shtml/sd/04001.shtml"
    # url = "https://kaijiang.500.com/shtml/plw/04001.shtml"
    # main(url, lottery_type='plw')
    get_data(url, lottery_type='pls')
db.query(Model).filter_by(draw_issue=issue).first() - if result is None: - record = Model() - record.draw_issue = issue - record.draw_code = draw_code - if m1 or m2 : - if m1: - record.draw_date = date(int(m1.group(1)), int(m1.group(2)), int(m1.group(3))) - else: - record.draw_date = date(int(m2.group(1)), int(m2.group(2)), int(m2.group(3))) - else: - raise Exception(f"issue:{issue}数据写入失败。。。") - # 如果是排列3和3D - if isinstance(record, PLS) or isinstance(record, SD): - record.hundred = draw_code[0] - record.ten = draw_code[1] - record.one = draw_code[2] - record.code_small = len(list(filter(lambda x: True if int(x) < 5 else False, draw_code))) - record.code_big = len(list(filter(lambda x: True if int(x) >= 5 else False, draw_code))) - record.code_single = len(list(filter(lambda x: True if int(x) % 2 == 1 else False, draw_code))) - record.code_double = len(list(filter(lambda x: True if int(x) % 2 == 0 else False, draw_code))) - record.draw_code = draw_code - record.sum_num = sum(map(int, draw_code)) - record.sum_hundred_one = int(draw_code[2]) + int(draw_code[0]) - record.sum_hundred_ten = int(draw_code[2]) + int(record.draw_code[1]) - record.sum_ten_one = int(record.draw_code[1]) + int(record.draw_code[0]) - if len(set(draw_code)) == 2: - record.group_type = 3 - elif len(set(draw_code)) == 3: - record.group_type = 6 - else: - record.group_type = 1 - db.add(record) - db.commit() - print(f"issue:{issue}数据写入完成。。。") - else: - print(f'issue:{issue}已经存在') - -def get_data(url, lottery_type): - try: - _get_data(url, lottery_type) - except Exception as e: - print(e) - print('异常出错重试后,依然报错') - raise e - -def main(basic_url, lottery_type): - """爬取相关数据""" - r = session.get(basic_url) - select_list = list(reversed(r.html.find('div.kjxq_box02_title_right span div a'))) - for item in select_list: - html = HTML(html=item.html) - url = html.find('a', first=True).attrs['href'] - try: - get_data(url, lottery_type) - except Exception as e: - print(e) - continue - -if __name__ == '__main__': 
class Lottery(object):
    """Persistence helper that stores one lottery draw per database row.

    The lottery type selects the ORM model (``PLS``, ``SD``, ``PLW`` or
    ``KLB``). For the three-digit games (``PLS`` / ``SD``) the per-digit
    statistics columns are filled in as well.
    """

    # NOTE(review): credentials are embedded in source; pass ``db_url`` (or
    # load it from configuration) for anything but local development.
    DEFAULT_DB_URL = "mysql+pymysql://root:123456@localhost/lottery?charset=utf8"

    # Draw-date formats seen in the scraped text. Raw strings keep ``\d`` and
    # ``\s`` from being treated as (invalid) string escape sequences.
    _pat1 = re.compile(r'开奖日期:(\d+)年(\d+)月(\d+)日.*')
    _pat2 = re.compile(r'开奖日期:(\d+)-(\d+)-(\d+)\s.*')

    # Number of distinct digits in a three-digit code -> group type.
    _GROUP_TYPE_BY_DISTINCT = {1: 1, 2: 3, 3: 6}

    # Session factories shared by all instances, keyed by database URL, so
    # repeated construction does not build a new engine/pool every time.
    _session_factories = {}

    _db_url = DEFAULT_DB_URL

    def __init__(self, lottery_type='pls', db_url=None):
        """Select the model for ``lottery_type`` and open a DB session.

        Args:
            lottery_type: One of ``'pls'``, ``'sd'``, ``'plw'``, ``'klb'``
                (case-insensitive).
            db_url: Optional SQLAlchemy URL; defaults to ``DEFAULT_DB_URL``.

        Raises:
            ValueError: If ``lottery_type`` is not one of the known kinds.
        """
        self._lottery_type = lottery_type
        models = {'pls': PLS, 'sd': SD, 'plw': PLW, 'klb': KLB}
        try:
            self._Model = models[lottery_type.lower()]
        except KeyError:
            raise ValueError("未知的lottery_type") from None
        # Same test as isinstance(record, (PLS, SD)) on a model instance.
        self._is_three_digit = issubclass(self._Model, (PLS, SD))
        if db_url is not None:
            self._db_url = db_url
        self.db = self._get_db_session()

    def _get_db_session(self):
        """Return a new session bound to the (cached) engine for this URL."""
        factory = self._session_factories.get(self._db_url)
        if factory is None:
            factory = sessionmaker(bind=create_engine(self._db_url))
            self._session_factories[self._db_url] = factory
        return factory()

    def _parse_draw_date(self, draw_issue, draw_date):
        """Extract the draw date from the scraped text as a ``date``."""
        match = self._pat1.match(draw_date) or self._pat2.match(draw_date)
        if match is None:
            raise ValueError(f"issue:{draw_issue}数据写入失败。。。")
        year, month, day = map(int, match.groups())
        return date(year, month, day)

    def _fill_three_digit_fields(self, record, draw_code):
        """Populate the digit/statistics columns of a three-digit draw."""
        if len(draw_code) != 3 or not draw_code.isdecimal():
            raise ValueError(
                f"draw_code must be exactly three digits, got {draw_code!r}"
            )
        hundred, ten, one = digits = [int(ch) for ch in draw_code]
        # The digit columns keep the original characters, not the ints.
        record.hundred, record.ten, record.one = draw_code
        record.code_small = sum(d < 5 for d in digits)
        record.code_big = sum(d >= 5 for d in digits)
        record.code_single = sum(d % 2 == 1 for d in digits)
        record.code_double = sum(d % 2 == 0 for d in digits)
        record.sum_num = sum(digits)
        record.sum_hundred_one = hundred + one
        record.sum_hundred_ten = hundred + ten
        record.sum_ten_one = ten + one
        record.group_type = self._GROUP_TYPE_BY_DISTINCT[len(set(draw_code))]

    def insert(self, draw_issue, draw_date, draw_code, **kwargs):
        """Store a draw unless a row for ``draw_issue`` already exists.

        Args:
            draw_issue: Issue identifier used as the uniqueness key.
            draw_date: Scraped text containing the draw date in one of the
                two supported formats.
            draw_code: Drawn numbers as a string of digits.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The ``id`` of the newly inserted row, or of the existing row
            when the issue is already stored.

        Raises:
            ValueError: If the date cannot be parsed, or a three-digit game
                receives a code that is not exactly three digits.
        """
        existing = self.db.query(self._Model).filter_by(draw_issue=draw_issue).first()
        if existing is not None:
            return existing.id

        record = self._Model()
        record.draw_issue = draw_issue
        record.draw_code = draw_code
        record.draw_date = self._parse_draw_date(draw_issue, draw_date)
        # Extra columns exist only for the three-digit games (PLS and SD).
        if self._is_three_digit:
            self._fill_three_digit_fields(record, draw_code)

        self.db.add(record)
        try:
            self.db.commit()
        except Exception:
            # Leave the session usable for the next insert.
            self.db.rollback()
            raise
        return record.id