From 0944d4f1c3e3e9fc4afdb856bb42f03d01da5bed Mon Sep 17 00:00:00 2001
From: oscarz
Date: Sat, 5 Apr 2025 17:20:28 +0800
Subject: [PATCH] Add local HTML cache, IN/NOT IN query filters, and debug
 logging

---
 iafd/src/fetch.py        |  7 +++++--
 iafd/src/iafd_scraper.py | 11 +++++++++++
 iafd/src/sqlite_utils.py | 27 ++++++++++++++++++++++++++-
 iafd/src/utils.py        | 42 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py
index 7a4e8b4..f356b94 100644
--- a/iafd/src/fetch.py
+++ b/iafd/src/fetch.py
@@ -286,7 +286,7 @@ def fetch_performers_detail():
     # Fetch the list of new performers
     while True:
         if force:  # Walk through every record from the start
-            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, before_updated_at='2025-04-01 00:00:00', order_by='id asc', limit=limit_count)
+            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
         else:  # Update-only pass
             perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count)
         if len(perfomers_list) < 1:
@@ -315,7 +315,7 @@ def fetch_movies_detail():
     last_movie_id = 0
     while True:
         if force:  # Walk through every record from the start
-            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, before_updated_at='2025-04-01 00:00:00', order_by='id asc', limit=limit_count)
+            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
         else:  # Update-only pass
             movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
         if len(movies_list) < 1:
@@ -379,6 +379,9 @@ function_map = {
 def main(cmd, args_debug, args_force):
     global debug
     debug = args_debug
+    if debug:
+        logger = logging.getLogger()
+        logger.setLevel(logging.DEBUG)
     global force
     force = args_force
 
diff --git a/iafd/src/iafd_scraper.py b/iafd/src/iafd_scraper.py
index e8c3d2f..501dbdf 100644
--- a/iafd/src/iafd_scraper.py
+++ b/iafd/src/iafd_scraper.py
@@ -37,9 +37,20 @@ headers = {
 
 scraper = cloudscraper.create_scraper()
 save_raw_html = True
+load_from_local = True
 
 # Use CloudScraper for network requests and validate each page; supports different parsers and preprocessing
 def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
+    if load_from_local:  # Serve the page from the local cache when possible
+        html = utils.read_raw_html(url)
+        if html:
+            # Preprocess the HTML if a preprocessor was provided
+            html_text = preprocessor(html) if preprocessor else html
+
+            soup = BeautifulSoup(html_text, parser)
+            if validator(soup):  # Run the custom page check
+                return soup, 200
+
     for attempt in range(max_retries):
         try:
             if host_url not in url.lower():
diff --git a/iafd/src/sqlite_utils.py b/iafd/src/sqlite_utils.py
index 3f0631f..b47e380 100644
--- a/iafd/src/sqlite_utils.py
+++ b/iafd/src/sqlite_utils.py
@@ -343,7 +343,19 @@ def query_performer_hrefs(**filters):
         params.append(f"%{filters['name']}%")
     if "is_full_data" in filters:
         sql += " AND is_full_data = ?"
         params.append(filters["is_full_data"])
+    if "is_full_data_in" in filters:
+        values = filters["is_full_data_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data IN ({placeholders})"
+            params.extend(values)
+    if "is_full_data_not_in" in filters:
+        values = filters["is_full_data_not_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data NOT IN ({placeholders})"
+            params.extend(values)
     if "before_updated_at" in filters:
         sql += " AND updated_at <= ?"
         params.append(filters["before_updated_at"])
@@ -360,7 +372,7 @@ def query_performer_hrefs(**filters):
         sql += " limit ?"
         params.append(filters["limit"])
 
-
+    logging.debug(f"query sql: {sql}")
     cursor.execute(sql, params)
     #return [row[0].lower() for row in cursor.fetchall()]  # return lowercase hrefs
     return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
@@ -760,6 +772,18 @@ def query_movie_hrefs(**filters):
     if "is_full_data" in filters:
         sql += " AND is_full_data = ?"
         params.append(filters["is_full_data"])
+    if "is_full_data_in" in filters:
+        values = filters["is_full_data_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data IN ({placeholders})"
+            params.extend(values)
+    if "is_full_data_not_in" in filters:
+        values = filters["is_full_data_not_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data NOT IN ({placeholders})"
+            params.extend(values)
     if "before_updated_at" in filters:
         sql += " AND updated_at <= ?"
         params.append(filters["before_updated_at"])
@@ -776,6 +800,7 @@ def query_movie_hrefs(**filters):
         sql += " limit ?"
         params.append(filters["limit"])
 
+    logging.debug(f"query sql: {sql}")
     cursor.execute(sql, params)
     #return [row[0].lower() for row in cursor.fetchall()]  # links use lowercase
     return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
diff --git a/iafd/src/utils.py b/iafd/src/utils.py
index 6bad9f6..59aeaae 100644
--- a/iafd/src/utils.py
+++ b/iafd/src/utils.py
@@ -3,6 +3,7 @@
 import os
 import json
 import time
 import csv
+from datetime import datetime
 import logging
 import config
@@ -117,6 +118,47 @@ def write_raw_html(href, html_text):
     except Exception as e:
         logging.warning(f"Unexpected error: {e}")
 
+
+# Read back raw HTML saved earlier, so pages can be re-checked without refetching
+def read_raw_html(href, expire_date="2025-03-01"):
+    # Resolve the cache directory from the href type
+    id = extract_id_from_href(href)
+    if 'person.rme' in href.lower():
+        dir_prefix = 'raw_performers'
+    elif 'title.rme' in href.lower():
+        dir_prefix = 'raw_movies'
+    else:
+        return None
+
+    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", id)
+    file_name = f"{id}.html"
+    full_path = os.path.join(file_dir, file_name)
+
+    try:
+        if os.path.exists(full_path):
+            # Get the file's last-modified time
+            last_modified_timestamp = os.path.getmtime(full_path)
+            # Convert the timestamp to a datetime object
+            last_modified_date = datetime.fromtimestamp(last_modified_timestamp)
+            # Use the cached copy only if it is newer than the expiry date
+            if last_modified_date > datetime.strptime(expire_date, "%Y-%m-%d"):
+                logging.debug(f"find local file on href {href}")
+                with open(full_path, 'r', encoding='utf-8') as file:
+                    return file.read()
+            else:
+                logging.debug(f"expired file {last_modified_date} on href {href}")
+                return None
+        else:
+            return None
+    except FileNotFoundError:
+        logging.warning(f"Error: the path {full_path} does not exist.")
+    except PermissionError:
+        logging.warning(f"Error: no permission to read file {full_path}.")
+    except Exception as e:
+        logging.warning(f"Unexpected error: {e}")
+    return None
+
+
 # Read a JSON file and return its contents
 def read_json(file_path):
     try:
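
Note on the IN/NOT IN filters above: the query builders join one "?"
placeholder per value, so filter values stay bound parameters instead of
being interpolated into the SQL string. A minimal, self-contained sketch of
the same technique (the table schema and sample rows are illustrative
assumptions, not the project's real schema):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    # Hypothetical schema for illustration only
    conn.execute("CREATE TABLE performers (id INTEGER PRIMARY KEY, href TEXT, is_full_data INTEGER)")
    conn.executemany(
        "INSERT INTO performers (href, is_full_data) VALUES (?, ?)",
        [("/person.rme/a", 0), ("/person.rme/b", 2), ("/person.rme/c", 3)],
    )

    def query_hrefs(is_full_data_not_in=None):
        sql = "SELECT href FROM performers WHERE 1=1"
        params = []
        if is_full_data_not_in:
            # One "?" per value keeps the query parameterized
            placeholders = ", ".join(["?"] * len(is_full_data_not_in))
            sql += f" AND is_full_data NOT IN ({placeholders})"
            params.extend(is_full_data_not_in)
        return [row[0] for row in conn.execute(sql, params)]

    print(query_hrefs(is_full_data_not_in=[2, 3]))  # ['/person.rme/a']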
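
Note on read_raw_html: a cached file counts as fresh only if its mtime is
newer than expire_date, so stale snapshots fall through to a network fetch in
fetch_page. A minimal sketch of that freshness check in isolation (the path
and cutoff date are illustrative assumptions):

    import os
    from datetime import datetime

    def is_fresh(path, expire_date="2025-03-01"):
        # No file on disk means no usable cache entry
        if not os.path.exists(path):
            return False
        # Compare the file's last-modified time against the cutoff date
        last_modified = datetime.fromtimestamp(os.path.getmtime(path))
        return last_modified > datetime.strptime(expire_date, "%Y-%m-%d")

    print(is_fresh("raw_performers/12345.html"))  # False unless a recent file exists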