import time
import argparse
import textwrap
import logging
from functools import partial
from urllib.parse import urljoin

import config
import sqlite_utils as db_tools
import scraper
import utils

config.setup_logging()

debug = False
skip_local = False
scan_mode = 0
update_mode = 0


# Fetch the actor list pages for one language and index them in the database.
def fetch_actor_list_lang(lang="en"):
    s_url = f"/{lang}/model"
    current_url = urljoin(scraper.host_url, s_url)
    num = 1
    while current_url:
        logging.info(f"fetching url {current_url}")
        data = scraper.fetch_post_page(current_url)
        if not data:
            logging.warning(f"fetch {current_url} error.")
            break

        # Validate the JSON structure.
        if not all(key in data for key in ["status", "results_count", "pagination_params", "template"]):
            logging.warning(f"[error] unexpected data structure: {data}")
            break

        # Parse the page data.
        all_data = scraper.parse_list_json(data, num=num, lang=lang)

        # Insert the rows into the database.
        for row in all_data:
            # For non-English languages, keep only the URL and the localized name.
            if lang != 'en':
                new_row = {}
                new_row['url'] = utils.replace_lang_param(row['url'])
                new_row[f"{lang}_name"] = row[f"{lang}_name"]
                insert_row = new_row
            else:
                insert_row = row
            row_id = db_tools.insert_actor_index(insert_row)
            if row_id:
                logging.debug(f"insert or update one row. row id: {row_id}, data: {insert_row}")
            else:
                logging.warning(f"insert or update actor failed. data: {insert_row}")

        # Follow the pagination to the next page.
        next_path = data.get("pagination_params", {}).get("next")
        if next_path:
            current_url = urljoin(scraper.host_url, next_path)
            logging.debug(f"next page: {current_url}")
            num += 1
            time.sleep(0.2)
        else:
            logging.info(f"all pages fetched. lang: {lang}")
            break

        # In debug mode, stop after the first page.
        if debug:
            return True


# Fetch the actor lists for all supported languages.
def fetch_actor_list():
    for lang in ["en", "ja", "zh"]:
        fetch_actor_list_lang(lang=lang)


# Fetch and update the detail page of each performer.
def fetch_performers_detail():
    limit_count = 5 if debug else 100
    performers_list = []
    last_performer_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]

    def get_performers(**kwargs):
        kwargs["order_by"] = 'id asc'
        return db_tools.query_actors(limit=limit_count, **kwargs)

    while True:
        if update_mode == 0:
            # Only new records (is_full_data=0).
            performers_list = get_performers(start_id=0, is_full_data=0)
        elif update_mode == 1:
            # Only complete records (is_full_data=1).
            performers_list = get_performers(start_id=last_performer_id, is_full_data=1)
        elif update_mode == 2:
            # New and complete records (0 + 1).
            performers_list = get_performers(start_id=last_performer_id, is_full_data_not_in=abnormal_codes)
        elif update_mode == 3:
            # Only abnormal records (404 / login required).
            performers_list = get_performers(start_id=last_performer_id, is_full_data_in=abnormal_codes)
        else:
            # All records.
            performers_list = get_performers(start_id=last_performer_id)

        if len(performers_list) < 1:
            logging.info('all performers fetched.')
            break

        succ_rows = 0
        for performer in performers_list:
            url = performer['url']
            person = performer['name']
            next_url = url
            need_insert = True
            while next_url:
                logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
                soup, status_code = scraper.fetch_page(
                    next_url,
                    partial(scraper.generic_validator, tag="div", identifier="info__features", attr_type="class"))
                if soup:
                    data, next_url = scraper.parse_actor_detail(soup, next_url)
                    if data:
                        # All of this performer's movies have been fetched; write the record.
                        performer_id = db_tools.update_actor_detail(data, is_full_data=1)
                        if performer_id:
                            logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
                            last_performer_id = performer_id
                            succ_rows += 1
                        else:
                            logging.warning(f'insert person: ({person}) {url} failed.')
                elif status_code and status_code == scraper.http_code_404:
                    actor_id = db_tools.update_actor_detail({'url': url}, is_full_data=scraper.http_code_404)
                    logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, skipping...')
                    need_insert = False
                    break
                elif status_code and status_code == scraper.http_code_login:
                    actor_id = db_tools.update_actor_detail({'url': url}, is_full_data=scraper.http_code_login)
                    logging.warning(f'401 page (login required). id: {actor_id}, name: ({person}), url: {url}, skipping...')
                    need_insert = False
                    break
                else:
                    logging.warning(f'fetch_page error. url: {url}')
                    break  # next_url is unchanged here, so retrying would loop forever

            # A 401/404 has already been recorded above; skip straight to the next performer.
            if not need_insert:
                continue
            time.sleep(0.5)

        logging.info(f'total request: {len(performers_list)}, succ: {succ_rows}, last performer id: {last_performer_id}')
        # In debug mode, stop after the first batch.
        if debug:
            return True


# Map command-line shortcuts to functions.
function_map = {
    "actor_list": fetch_actor_list,
    "actors": fetch_performers_detail,
}


# Main entry point.
def main(cmd, args):
    if cmd:
        # Run only the requested functions.
        function_names = cmd.split(",")
        for short_name in function_names:
            func = function_map.get(short_name.strip())
            if callable(func):
                func()
            else:
                logging.warning(f" {short_name} is not a valid function shortcut.")
    else:
        # Run everything.
        for name, func in function_map.items():
            if callable(func):
                func()
            else:
                logging.warning(f" {name} is not a valid function shortcut.")
    logging.info('all processes completed!')

# TODO:
# 1,

# Apply the command-line arguments to the module-level settings.
def set_env(args):
    global debug
    debug = args.debug
    if debug:
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)
    global skip_local
    skip_local = args.skip_local
    global scan_mode
    scan_mode = args.scan_mode
    global update_mode
    if args.update:
        update_mode = args.update


if __name__ == "__main__":
    # Command-line argument handling.
    keys_str = ",".join(function_map.keys())
    usage_examples = textwrap.dedent('''
        Examples:
          python3 ./fetch.py                   # refresh the list pages, then crawl newly added performers
          python3 ./fetch.py --update=4        # refresh the list pages, then crawl all records
          python3 ./fetch.py --cmd=actor_list  # refresh all performers on the list pages (three languages)
          python3 ./fetch.py --cmd=actors      # crawl newly added performers
    ''')
    parser = argparse.ArgumentParser(
        description='fetch javhd data.\n\n' + usage_examples,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--cmd", type=str,
                        help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0,
                        help='0 - only records with is_full_data=0 (default), 1 - only is_full_data=1, '
                             '2 - is_full_data<=1, 3 - only is_full_data>1 (abnormal records), 4 - all records')
    parser.add_argument('--scan_mode', type=int, choices=[0, 1, 2], default=1,
                        help='1 - only uncensored makers/series/actors/movies (default), 0 - the opposite, 2 - everything')
    parser.add_argument('--skip_local', action='store_true',
                        help='skip database operations when the page is already cached locally')
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    args = parser.parse_args()

    set_env(args)
    main(args.cmd, args)