modify scripts
@@ -286,7 +286,7 @@ def fetch_performers_detail():
     # Fetch the list of new performers
     while True:
         if force:  # traverse everything from the beginning
-            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, before_updated_at='2025-04-01 00:00:00', order_by='id asc', limit=limit_count)
+            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
         else:  # update-only
             perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count)
         if len(perfomers_list) < 1:
@@ -315,7 +315,7 @@ def fetch_movies_detail():
     last_movie_id = 0
     while True:
         if force:  # traverse everything from the beginning
-            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, before_updated_at='2025-04-01 00:00:00', order_by='id asc', limit=limit_count)
+            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
         else:  # update-only
             movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
         if len(movies_list) < 1:
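The force branch in both functions now pages by primary key (keyset pagination) instead of filtering by timestamp. A self-contained sketch of that paging shape, with stubbed data and a stand-in for the real detail fetch (the stub function and its `id` field are illustrative, not from the commit):

def query_performer_hrefs_stub(start_id=0, limit=3):
    # fake rows standing in for the performers table
    rows = [(i, f"/person.rme/{i}") for i in range(1, 11)]
    return [{"id": i, "href": h} for i, h in rows if i > start_id][:limit]

last_id = 0
while True:
    batch = query_performer_hrefs_stub(start_id=last_id, limit=3)
    if len(batch) < 1:
        break  # no more rows to process
    for row in batch:
        # stand-in for the real per-performer detail fetch
        last_id = row["id"]  # advance the cursor past the last processed row
print("processed up to id", last_id)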
@@ -379,6 +379,9 @@ function_map = {
 def main(cmd, args_debug, args_force):
     global debug
     debug = args_debug
+    if debug:
+        logger = logging.getLogger()
+        logger.setLevel(logging.DEBUG)

     global force
     force = args_force
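Raising the root logger to DEBUG here is what makes the `logging.debug(f"query sql: {sql}")` lines added further down visible. A minimal demonstration:

import logging

logging.basicConfig()                    # assumes no handlers are configured yet
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("query sql: SELECT ...")   # printed now; silent at the default WARNING level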
@@ -37,9 +37,20 @@ headers = {
 scraper = cloudscraper.create_scraper()

 save_raw_html = True
+load_from_local = True

 # Use CloudScraper for network requests and page validation; supports different parsers and optional preprocessing
 def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
+    if load_from_local:  # try the local cache first
+        html = utils.read_raw_html(url)
+        if html:
+            # Preprocess the HTML (if a preprocessor was provided)
+            html_text = preprocessor(html) if preprocessor else html
+
+            soup = BeautifulSoup(html_text, parser)
+            if validator(soup):  # run the custom page check
+                return soup, 200
+
     for attempt in range(max_retries):
         try:
             if host_url not in url.lower():
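With `load_from_local` enabled, `fetch_page` short-circuits to the on-disk copy and only hits the network when the cached page is missing or fails validation. A usage sketch, assuming the script's `fetch_page` is in scope (the URL and validator below are illustrative, not from the commit):

# A validator receives the parsed BeautifulSoup document and returns True
# when the page looks complete.
def looks_like_performer_page(soup):
    return soup.select_one("h1") is not None

soup, status = fetch_page(
    "https://example.com/person.rme/12345",   # placeholder URL
    validator=looks_like_performer_page,
    parser="html.parser")
if status == 200:
    print(soup.h1)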
@@ -343,7 +343,18 @@ def query_performer_hrefs(**filters):
         params.append(f"%{filters['name']}%")
     if "is_full_data" in filters:
         sql += " AND is_full_data = ?"
-        params.append(filters["is_full_data"])
+    if "is_full_data_in" in filters:
+        values = filters["is_full_data_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data IN ({placeholders})"
+            params.extend(values)
+    if "is_full_data_not_in" in filters:
+        values = filters["is_full_data_not_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data NOT IN ({placeholders})"
+            params.extend(values)
     if "before_updated_at" in filters:
         sql += " AND updated_at <= ?"
         params.append(filters["before_updated_at"])
@@ -360,7 +371,7 @@ def query_performer_hrefs(**filters):
         sql += " limit ?"
         params.append(filters["limit"])

-
+    logging.debug(f"query sql: {sql}")
     cursor.execute(sql, params)
     #return [row[0].lower() for row in cursor.fetchall()]  # return lowercase
     return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
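The new filters expand a Python list into a matching run of `?` placeholders, so the values stay parameterized rather than being interpolated into the SQL string. A standalone illustration, assuming an sqlite3 backend and a hypothetical `performers` table:

import sqlite3

values = [2, 3]
placeholders = ", ".join(["?"] * len(values))    # -> "?, ?"
sql = f"SELECT href FROM performers WHERE 1=1 AND is_full_data NOT IN ({placeholders})"

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE performers (href TEXT, is_full_data INTEGER)")
conn.executemany("INSERT INTO performers VALUES (?, ?)",
                 [("/a", 0), ("/b", 2), ("/c", 3)])
print(conn.execute(sql, values).fetchall())      # -> [('/a',)] -- 2 and 3 filtered out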
@@ -760,6 +771,18 @@ def query_movie_hrefs(**filters):
     if "is_full_data" in filters:
         sql += " AND is_full_data = ?"
         params.append(filters["is_full_data"])
+    if "is_full_data_in" in filters:
+        values = filters["is_full_data_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data IN ({placeholders})"
+            params.extend(values)
+    if "is_full_data_not_in" in filters:
+        values = filters["is_full_data_not_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data NOT IN ({placeholders})"
+            params.extend(values)
     if "before_updated_at" in filters:
         sql += " AND updated_at <= ?"
         params.append(filters["before_updated_at"])
@@ -776,6 +799,7 @@ def query_movie_hrefs(**filters):
         sql += " limit ?"
         params.append(filters["limit"])

+    logging.debug(f"query sql: {sql}")
     cursor.execute(sql, params)
     #return [row[0].lower() for row in cursor.fetchall()]  # links use lowercase
     return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
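The inclusive `is_full_data_in` variant works the same way. A hypothetical call (no other filters assumed):

rows = db_tools.query_movie_hrefs(is_full_data_in=[2, 3], limit=50)
# SQL gains: AND is_full_data IN (?, ?)   with params [2, 3, 50]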
@@ -3,6 +3,7 @@ import os
 import json
 import time
 import csv
+from datetime import datetime
 import logging
 import config

@@ -117,6 +118,47 @@ def write_raw_html(href, html_text):
     except Exception as e:
         logging.warning(f"Unexpected error: {e}")
+
+
+# Read back previously saved raw HTML so a page can be reused instead of re-fetched
+def read_raw_html(href, expire_date="2025-03-01"):
+    # Work out the cache directory
+    id = extract_id_from_href(href)
+    if 'person.rme' in href.lower():
+        dir_prefix = 'raw_performers'
+    elif 'title.rme' in href.lower():
+        dir_prefix = 'raw_movies'
+    else:
+        return None
+
+    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", id)
+    file_name = f"{id}.html"
+    full_path = os.path.join(file_dir, file_name)
+
+    try:
+        if os.path.exists(full_path):
+            # Get the file's last-modified time
+            last_modified_timestamp = os.path.getmtime(full_path)
+            # Convert the timestamp to a datetime object
+            last_modified_date = datetime.fromtimestamp(last_modified_timestamp)
+            # Check whether the file was modified after the given cutoff date
+            if last_modified_date > datetime.strptime(expire_date, "%Y-%m-%d"):
+                logging.debug(f"find local file on href {href}")
+                with open(full_path, 'r', encoding='utf-8') as file:
+                    return file.read()
+            else:
+                logging.debug(f"expired file {last_modified_date} on href {href}")
+                return None
+        else:
+            return None
+    except FileNotFoundError:
+        logging.warning(f"Error: the path {full_path} does not exist.")
+    except PermissionError:
+        logging.warning(f"Error: no permission to read file {full_path}.")
+    except Exception as e:
+        logging.warning(f"Unexpected error: {e}")
+    return None
+

 # Read a JSON file and return its contents
 def read_json(file_path):
     try:
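`read_raw_html` treats the file's mtime as a freshness stamp: anything modified at or before `expire_date` is ignored so the caller falls through to a network fetch. A self-contained sketch of just that check (temporary file stands in for a cached page):

import os
import tempfile
from datetime import datetime

expire_date = datetime.strptime("2025-03-01", "%Y-%m-%d")

# Create a throwaway "cached" file; its mtime is now, so it counts as fresh.
with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as f:
    f.write(b"<html></html>")
    full_path = f.name

last_modified = datetime.fromtimestamp(os.path.getmtime(full_path))
if last_modified > expire_date:
    print("fresh enough, reuse local copy")
else:
    print("expired, re-fetch from the network")
os.unlink(full_path)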