From 0944d4f1c3e3e9fc4afdb856bb42f03d01da5bed Mon Sep 17 00:00:00 2001
From: oscarz
Date: Sat, 5 Apr 2025 17:20:28 +0800
Subject: [PATCH] Add local HTML cache, IN/NOT IN query filters, and debug
 logging

---
 iafd/src/fetch.py        |  7 +++++--
 iafd/src/iafd_scraper.py | 11 +++++++++++
 iafd/src/sqlite_utils.py | 27 ++++++++++++++++++++++++++-
 iafd/src/utils.py        | 42 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py
index 7a4e8b4..f356b94 100644
--- a/iafd/src/fetch.py
+++ b/iafd/src/fetch.py
@@ -286,7 +286,7 @@ def fetch_performers_detail():
     # Fetch the list of new performers
     while True:
         if force:  # Walk through every record from the start
-            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, before_updated_at='2025-04-01 00:00:00', order_by='id asc', limit=limit_count)
+            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
         else:  # Update-only pass
             perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count)
         if len(perfomers_list) < 1:
@@ -315,7 +315,7 @@ def fetch_movies_detail():
     last_movie_id = 0
     while True:
         if force:  # Walk through every record from the start
-            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, before_updated_at='2025-04-01 00:00:00', order_by='id asc', limit=limit_count)
+            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
         else:  # Update-only pass
             movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
         if len(movies_list) < 1:
@@ -379,6 +379,9 @@ function_map = {
 def main(cmd, args_debug, args_force):
     global debug
     debug = args_debug
+    if debug:
+        logger = logging.getLogger()
+        logger.setLevel(logging.DEBUG)
     global force
     force = args_force
 
diff --git a/iafd/src/iafd_scraper.py b/iafd/src/iafd_scraper.py
index e8c3d2f..501dbdf 100644
--- a/iafd/src/iafd_scraper.py
+++ b/iafd/src/iafd_scraper.py
@@ -37,9 +37,20 @@ headers = {
 
 scraper = cloudscraper.create_scraper()
 save_raw_html = True
+load_from_local = True
 
 # Use CloudScraper for network requests and validate each page; supports different parsers and preprocessing
 def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
+    if load_from_local:  # Serve the page from the local cache when possible
+        html = utils.read_raw_html(url)
+        if html:
+            # Preprocess the HTML if a preprocessor was provided
+            html_text = preprocessor(html) if preprocessor else html
+
+            soup = BeautifulSoup(html_text, parser)
+            if validator(soup):  # Run the custom page check
+                return soup, 200
+
     for attempt in range(max_retries):
         try:
             if host_url not in url.lower():
diff --git a/iafd/src/sqlite_utils.py b/iafd/src/sqlite_utils.py
index 3f0631f..b47e380 100644
--- a/iafd/src/sqlite_utils.py
+++ b/iafd/src/sqlite_utils.py
@@ -343,7 +343,19 @@ def query_performer_hrefs(**filters):
         params.append(f"%{filters['name']}%")
     if "is_full_data" in filters:
         sql += " AND is_full_data = ?"
         params.append(filters["is_full_data"])
+    if "is_full_data_in" in filters:
+        values = filters["is_full_data_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data IN ({placeholders})"
+            params.extend(values)
+    if "is_full_data_not_in" in filters:
+        values = filters["is_full_data_not_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data NOT IN ({placeholders})"
+            params.extend(values)
     if "before_updated_at" in filters:
         sql += " AND updated_at <= ?"
         params.append(filters["before_updated_at"])
@@ -360,7 +372,7 @@ def query_performer_hrefs(**filters):
         sql += " limit ?"
         params.append(filters["limit"])
 
-
+    logging.debug(f"query sql: {sql}")
     cursor.execute(sql, params)
     #return [row[0].lower() for row in cursor.fetchall()]  # return lowercase hrefs
     return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
@@ -760,6 +772,18 @@ def query_movie_hrefs(**filters):
     if "is_full_data" in filters:
         sql += " AND is_full_data = ?"
         params.append(filters["is_full_data"])
+    if "is_full_data_in" in filters:
+        values = filters["is_full_data_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data IN ({placeholders})"
+            params.extend(values)
+    if "is_full_data_not_in" in filters:
+        values = filters["is_full_data_not_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data NOT IN ({placeholders})"
+            params.extend(values)
     if "before_updated_at" in filters:
         sql += " AND updated_at <= ?"
         params.append(filters["before_updated_at"])
@@ -776,6 +800,7 @@ def query_movie_hrefs(**filters):
         sql += " limit ?"
         params.append(filters["limit"])
 
+    logging.debug(f"query sql: {sql}")
     cursor.execute(sql, params)
     #return [row[0].lower() for row in cursor.fetchall()]  # links use lowercase
     return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
diff --git a/iafd/src/utils.py b/iafd/src/utils.py
index 6bad9f6..59aeaae 100644
--- a/iafd/src/utils.py
+++ b/iafd/src/utils.py
@@ -3,6 +3,7 @@
 import os
 import json
 import time
 import csv
+from datetime import datetime
 import logging
 import config
@@ -117,6 +118,47 @@ def write_raw_html(href, html_text):
     except Exception as e:
         logging.warning(f"Unexpected error: {e}")
 
+
+# Read back raw HTML saved earlier, so pages can be re-checked without refetching
+def read_raw_html(href, expire_date="2025-03-01"):
+    # Resolve the cache directory from the href type
+    id = extract_id_from_href(href)
+    if 'person.rme' in href.lower():
+        dir_prefix = 'raw_performers'
+    elif 'title.rme' in href.lower():
+        dir_prefix = 'raw_movies'
+    else:
+        return None
+
+    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", id)
+    file_name = f"{id}.html"
+    full_path = os.path.join(file_dir, file_name)
+
+    try:
+        if os.path.exists(full_path):
+            # Get the file's last-modified time
+            last_modified_timestamp = os.path.getmtime(full_path)
+            # Convert the timestamp to a datetime object
+            last_modified_date = datetime.fromtimestamp(last_modified_timestamp)
+            # Use the cached copy only if it is newer than the expiry date
+            if last_modified_date > datetime.strptime(expire_date, "%Y-%m-%d"):
+                logging.debug(f"find local file on href {href}")
+                with open(full_path, 'r', encoding='utf-8') as file:
+                    return file.read()
+            else:
+                logging.debug(f"expired file {last_modified_date} on href {href}")
+                return None
+        else:
+            return None
+    except FileNotFoundError:
+        logging.warning(f"Error: the path {full_path} does not exist.")
+    except PermissionError:
+        logging.warning(f"Error: no permission to read file {full_path}.")
+    except Exception as e:
+        logging.warning(f"Unexpected error: {e}")
+    return None
+
+
 # Read a JSON file and return its contents
 def read_json(file_path):
     try:
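
Note on the IN/NOT IN filters above: the query builders join one "?"
placeholder per value, so filter values stay bound parameters instead of
being interpolated into the SQL string. A minimal, self-contained sketch of
the same technique (the table schema and sample rows are illustrative
assumptions, not the project's real schema):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    # Hypothetical schema for illustration only
    conn.execute("CREATE TABLE performers (id INTEGER PRIMARY KEY, href TEXT, is_full_data INTEGER)")
    conn.executemany(
        "INSERT INTO performers (href, is_full_data) VALUES (?, ?)",
        [("/person.rme/a", 0), ("/person.rme/b", 2), ("/person.rme/c", 3)],
    )

    def query_hrefs(is_full_data_not_in=None):
        sql = "SELECT href FROM performers WHERE 1=1"
        params = []
        if is_full_data_not_in:
            # One "?" per value keeps the query parameterized
            placeholders = ", ".join(["?"] * len(is_full_data_not_in))
            sql += f" AND is_full_data NOT IN ({placeholders})"
            params.extend(is_full_data_not_in)
        return [row[0] for row in conn.execute(sql, params)]

    print(query_hrefs(is_full_data_not_in=[2, 3]))  # ['/person.rme/a']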
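
Note on read_raw_html: a cached file counts as fresh only if its mtime is
newer than expire_date, so stale snapshots fall through to a network fetch in
fetch_page. A minimal sketch of that freshness check in isolation (the path
and cutoff date are illustrative assumptions):

    import os
    from datetime import datetime

    def is_fresh(path, expire_date="2025-03-01"):
        # No file on disk means no usable cache entry
        if not os.path.exists(path):
            return False
        # Compare the file's last-modified time against the cutoff date
        last_modified = datetime.fromtimestamp(os.path.getmtime(path))
        return last_modified > datetime.strptime(expire_date, "%Y-%m-%d")

    print(is_fresh("raw_performers/12345.html"))  # False unless a recent file exists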