diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py
index e02eb8d..c1a68cc 100644
--- a/iafd/src/fetch.py
+++ b/iafd/src/fetch.py
@@ -262,8 +262,11 @@ def fetch_performers_detail_once(perfomers_list):
             else:
                 logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
         elif status_code and status_code == 404:
-            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url)
+            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=2)
             logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skiping...')
+        elif status_code and status_code == 601:
+            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=3)
+            logging.warning(f'601 page (wrong url). id: {performer_id}, name: {person}, url: {url}, Skipping...')
         else:
             logging.warning(f'fetch_page error. person: ({person}), url: {url}')
         time.sleep(1)
@@ -285,8 +288,8 @@ def fetch_performers_detail():
         if debug:
             break
 
-    # Get the list of performers that still need updating
-    while True:
+    # Get the list of performers that still need updating; this reconciliation still has some issues
+    while False:
         perfomers_list = db_tools.get_performers_needed_update(limit=limit_count)
         if len(perfomers_list) < 1:
             logging.info(f'all existed performers updated. ')
@@ -334,8 +337,12 @@ def fetch_movies_detail():
                 logging.warning(f'parse_page_movie error. url: {url}')
         elif status_code and status_code == 404:
             # Mark as processed
-            movie_id = db_tools.insert_or_update_movie_404(title=title, href=url)
+            movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=2)
             logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
+        elif status_code and status_code == 601:
+            # Mark as processed
+            movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=3)
+            logging.warning(f'601 page (wrong url). id: {movie_id}, title: ({title}), url: {url}, Skipping...')
         else:
             logging.warning(f'fetch_page error. url: {url}')
         time.sleep(1)
@@ -394,9 +401,10 @@ def main(cmd, args_debug, args_force):
     db_tools.finalize_task_log(task_id)
 
     # TODO:
-    # 1. After movies are updated, mark the corresponding performers with is_full_data = 0 and refresh them
-    # 2. Cross-check the movie lists between distributors and studios
-    # 3. For inconsistent data, manually import all performers and movies first, then use this program to fetch new entries incrementally
+    # 1. The movie count on a performer's page can differ from the count aggregated from the movie list. One cause is a movie with several directors where a director also appears as a performer, e.g.:
+    #    https://www.iafd.com/title.rme/id=0f79d81f-25ff-40d1-967a-24b99f03b79a
+    #    https://www.iafd.com/person.rme/id=37efc86d-fefe-436d-8e3e-2e04b4e6565c
+    #    The movie table currently misses some of that director information and needs to be adjusted.
 
 if __name__ == "__main__":
     # Command-line argument handling
diff --git a/iafd/src/iafd_scraper.py b/iafd/src/iafd_scraper.py
index ac1f096..bcedf19 100644
--- a/iafd/src/iafd_scraper.py
+++ b/iafd/src/iafd_scraper.py
@@ -12,6 +12,7 @@ from bs4 import BeautifulSoup
 from requests.exceptions import RequestException
 from functools import partial
 import config
+import utils
 
 # Define the base URL and variable parameters
 host_url = "https://www.iafd.com"
@@ -35,13 +36,15 @@ headers = {
 }
 
 scraper = cloudscraper.create_scraper()
 
+save_raw_html = True
+
 # Use CloudScraper for network requests with page validation; supports different parsers and preprocessing
 def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
     for attempt in range(max_retries):
         try:
             if host_url not in url.lower():
                 logging.error(f'wrong url format: {url}')
-                return None, None
+                return None, 601
 
             response = scraper.get(url, headers=headers)
@@ -57,6 +60,9 @@
                 logging.debug(f"invalid or outdated page: {url}")
                 return None, 404  # Return 404 directly so the caller can skip the page
 
+            if save_raw_html:
+                utils.write_raw_html(url, response.text)
+
             # Preprocess the HTML (if a preprocessor was provided)
             html_text = preprocessor(response.text) if preprocessor else response.text
 
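Because fetch_page now writes the raw response to disk through utils.write_raw_html (added in the utils.py hunk below) for any page that is not rejected as invalid, pages can be re-checked later without another network round trip. A minimal sketch of such an offline pass; RAW_DIR is an assumed path and should point at the update directory that utils.py actually writes to:

    # Offline re-parse of saved pages -- a sketch; RAW_DIR is an assumed location.
    import glob
    import os

    from bs4 import BeautifulSoup

    RAW_DIR = "../result/update/raw_movies"  # assumption: wherever utils.write_raw_html stores raw_movies

    # write_raw_html buckets files into sub-directories, so walk the tree recursively
    for path in glob.glob(os.path.join(RAW_DIR, "**", "*.html"), recursive=True):
        with open(path, "r", encoding="utf-8") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")
        # re-run any check of interest against the saved markup, e.g. print the page title
        title = soup.title.get_text(strip=True) if soup.title else ""
        print(os.path.basename(path), title)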
iafd_performers_movies (performer_id);"), + ("idx_iafd_movies_director_id", + "CREATE INDEX idx_iafd_movies_director_id ON iafd_movies (director_id);"), + ("idx_iafd_performers_id", + "CREATE INDEX idx_iafd_performers_id ON iafd_performers (id);") + ] + for index_name, create_index_sql in indexes: + cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name=?", (index_name,)) + if not cursor.fetchone(): + cursor.execute(create_index_sql) + logging.info(f"Index {index_name} created successfully.") + else: + logging.info(f"Index {index_name} already exists.") + + # 检查视图是否存在,如果不存在则创建 + view_name = "view_perfomers_cnt" + cursor.execute("SELECT name FROM sqlite_master WHERE type='view' AND name=?", (view_name,)) + if not cursor.fetchone(): + create_view_sql = """ + CREATE VIEW view_perfomers_cnt AS + SELECT + id, + href, + name, + movies_cnt, + SUM(CASE WHEN role = 'actor' THEN movie_count ELSE 0 END) AS actor_movie_count, + SUM(CASE WHEN role = 'director' THEN movie_count ELSE 0 END) AS director_movie_count + FROM ( + SELECT + p.id, + p.href, + p.name, + p.movies_cnt, + COUNT(apm.movie_id) AS movie_count, + 'actor' AS role + FROM + iafd_performers p + LEFT JOIN + iafd_performers_movies apm ON p.id = apm.performer_id + GROUP BY + p.id, p.href, p.name, p.movies_cnt + + UNION ALL + + SELECT + p.id, + p.href, + p.name, + p.movies_cnt, + COUNT(im.id) AS movie_count, + 'director' AS role + FROM + iafd_performers p + LEFT JOIN + iafd_movies im ON p.id = im.director_id + GROUP BY + p.id, p.href, p.name, p.movies_cnt + ) combined + GROUP BY + id, href, name, movies_cnt; + """ + cursor.execute(create_view_sql) + logging.info(f"View {view_name} created successfully.") + else: + logging.info(f"View {view_name} already exists.") + + # 提交更改并关闭连接 + conn.commit() + except sqlite3.Error as e: + logging.warning(f"An error occurred: {e}") + + + # 插入一条任务日志 def insert_task_log(): try: @@ -834,7 +913,9 @@ def finalize_task_log(task_id): logging.error(f"任务 {task_id} 结束失败: {e}") if __name__ == "__main__": + create_view_and_indexes() + ''' try: with open('../result/detail.json', 'r') as file: performers = json.load(file) @@ -845,4 +926,6 @@ if __name__ == "__main__": #delete_performer("https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34") print(query_performer_hrefs()) except FileNotFoundError: - logging.info("detail.json not found, starting fresh.") \ No newline at end of file + logging.info("detail.json not found, starting fresh.") + + ''' \ No newline at end of file diff --git a/iafd/src/utils.py b/iafd/src/utils.py index 6213d08..9c2fbdd 100644 --- a/iafd/src/utils.py +++ b/iafd/src/utils.py @@ -87,6 +87,30 @@ def write_movie_json(href, data): except Exception as e: logging.error(f"Error writing file {full_path}: {e}") +# 保存抓取到的原始HTML,方便后续核验 +def write_raw_html(href, html_text): + # 获取目录 + id = extract_id_from_href(href) + if 'person.rme' in href.lower(): + dir_prefix = 'raw_performers' + elif 'title.rme' in href.lower(): + dir_prefix = 'raw_movies' + else: + return + + file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", id) + file_name = f"{id}.html" # 用 - 替换空格 + full_path = os.path.join(file_dir, file_name) + + try: + with open(full_path, 'w', encoding='utf-8') as file: + file.write(html_text) + except FileNotFoundError: + logging.warning(f"错误:指定的路径 {full_path} 不存在。") + except PermissionError: + logging.warning(f"错误:没有权限写入文件 {full_path}。") + except Exception as e: + logging.warning(f"发生未知错误:{e}") # 读取json文件并返回内容 def read_json(file_path):