resources/javhd/src/fetch.py

import json
import time
import csv
import argparse
import textwrap
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import scraper
import utils
from urllib.parse import urljoin, urlparse
config.setup_logging()
debug = False
skip_local = False
scan_mode = 0
update_mode = 0
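
# The project-local helper modules are not included here; the calls below assume
# roughly the following interfaces (inferred from how this file uses them, not
# from their actual source):
#   scraper.host_url                          -- base URL of the target site
#   scraper.fetch_post_page(url)              -- fetch a listing page, return parsed JSON or None
#   scraper.fetch_page(url, validator)        -- fetch an HTML page, return (soup, status_code)
#   scraper.parse_list_json(data, num, lang)  -- turn a listing JSON payload into row dicts
#   scraper.parse_actor_detail(soup, url)     -- parse an actor page, return (data, next_page_url)
#   scraper.http_code_404 / http_code_login   -- sentinel status codes for abnormal pages
#   db_tools.insert_actor_index(row)          -- upsert one actor index row, return its row id
#   db_tools.query_actors(...) / update_actor_detail(...) -- read and update actor records
#   utils.replace_lang_param(url)             -- normalize a localized URL back to the default language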

# Fetch the actor list for one language.
def fetch_actor_list_lang(lang="en"):
    s_url = f"/{lang}/model"
    current_url = urljoin(scraper.host_url, s_url)
    num = 1
    while current_url:
        logging.info(f"fetching url {current_url}")
        data = scraper.fetch_post_page(current_url)
        if not data:
            logging.warning(f"fetch {current_url} error.")
            break
        # Sanity-check the JSON structure.
        if not all(key in data for key in ["status", "results_count", "pagination_params", "template"]):
            logging.warning(f"[error] unexpected data structure: {data}")
            break
        # Parse the listing data.
        all_data = scraper.parse_list_json(data, num=num, lang=lang)
        # Insert the rows into the database.
        for row in all_data:
            # For non-English languages keep only the localized name.
            if lang != 'en':
                new_row = {}
                new_row['url'] = utils.replace_lang_param(row['url'])
                new_row[f"{lang}_name"] = row[f"{lang}_name"]
                insert_row = new_row
            else:
                insert_row = row
            row_id = db_tools.insert_actor_index(insert_row)
            if row_id:
                logging.debug(f"insert or update one row. row id: {row_id}, data: {insert_row}")
            else:
                logging.warning(f"insert or update actor failed. data: {insert_row}")
        # Move on to the next page.
        next_path = data.get("pagination_params", {}).get("next")
        if next_path:
            current_url = urljoin(scraper.host_url, next_path)
            logging.debug(f"next page: {current_url}")
            num += 1
            time.sleep(0.2)
        else:
            logging.info(f"all pages fetched. lang: {lang}")
            break
        # In debug mode stop after the first page.
        if debug:
            return True
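
# For reference, the listing endpoint is expected to return JSON shaped roughly
# like the following (inferred from the keys checked above; the field values are
# illustrative, not taken from the real API):
#   {
#       "status": "ok",
#       "results_count": 120,
#       "template": "<li>...rendered actor cards...</li>",
#       "pagination_params": {"next": "/en/model?page=2"}   # absent or empty on the last page
#   }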

# Fetch the actor list in every supported language.
def fetch_actor_list():
    for lang in ["en", "ja", "zh"]:
        fetch_actor_list_lang(lang=lang)

# Update actor detail records.
def fetch_performers_detail():
    limit_count = 5 if debug else 100
    performers_list = []
    last_performer_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]

    def get_performers(**kwargs):
        kwargs["order_by"] = 'id asc'
        return db_tools.query_actors(limit=limit_count, **kwargs)

    while True:
        if update_mode == 0:    # only new records
            performers_list = get_performers(start_id=0, is_full_data=0)
        elif update_mode == 1:  # only complete records
            performers_list = get_performers(start_id=last_performer_id, is_full_data=1)
        elif update_mode == 2:  # new and complete records (0 and 1)
            performers_list = get_performers(start_id=last_performer_id, is_full_data_not_in=abnormal_codes)
        elif update_mode == 3:  # abnormal records only
            performers_list = get_performers(start_id=last_performer_id, is_full_data_in=abnormal_codes)
        else:                   # everything
            performers_list = get_performers(start_id=last_performer_id)
        if len(performers_list) < 1:
            logging.info('all performers fetched.')
            break
        succ_rows = 0
        for performer in performers_list:
            url = performer['url']
            person = performer['name']
            next_url = url
            need_insert = True
            while next_url:
                logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="info__features", attr_type="class"))
                if soup:
                    data, next_url = scraper.parse_actor_detail(soup, next_url)
                    if data:
                        # Finished fetching this actor's videos; insert the data.
                        performer_id = db_tools.update_actor_detail(data, is_full_data=1)
                        if performer_id:
                            logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
                            last_performer_id = performer_id
                            succ_rows += 1
                        else:
                            logging.warning(f'insert person: ({person}) {url} failed.')
                elif status_code and status_code == scraper.http_code_404:
                    actor_id = db_tools.update_actor_detail({'url': url}, is_full_data=scraper.http_code_404)
                    logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, skipping...')
                    need_insert = False
                    break
                elif status_code and status_code == scraper.http_code_login:
                    actor_id = db_tools.update_actor_detail({'url': url}, is_full_data=scraper.http_code_login)
                    logging.warning(f'401 page (login required). id: {actor_id}, name: ({person}), url: {url}, skipping...')
                    need_insert = False
                    break
                else:
                    logging.warning(f'fetch_page error. url: {url}')
                    break  # give up instead of retrying the same URL forever
            # A 401 or 404 has already been handled above, so move on to the next actor.
            if not need_insert:
                continue
            time.sleep(0.5)
        logging.info(f'total request: {len(performers_list)}, succ: {succ_rows}, last performer id: {last_performer_id}')
        # In debug mode stop after the first batch.
        if debug:
            return True
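
# Note on the is_full_data column (as used above; an inference, not authoritative):
# 0 marks a newly indexed actor, 1 marks a fully fetched record, and the scraper's
# http_code_404 / http_code_login sentinels are stored for pages that came back 404
# or behind a login wall, which is what the --update modes select on.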

# Map command shortcuts to their functions.
function_map = {
    "actor_list": fetch_actor_list,
    "actors": fetch_performers_detail,
}

# Main entry point.
def main(cmd, args):
    # Run the requested functions.
    if cmd:
        function_names = args.cmd.split(",")  # split the comma-separated input
        for short_name in function_names:
            func = function_map.get(short_name.strip())  # look up the function in the map
            if callable(func):
                func()
            else:
                logging.warning(f"{short_name} is not a valid function shortcut.")
    else:  # no --cmd given: run everything
        for name, func in function_map.items():
            if callable(func):
                func()
            else:
                logging.warning(f"{name} is not a valid function shortcut.")
    logging.info('all processing completed!')

# TODO:
# 1,

# Set module-level options from the command-line arguments.
def set_env(args):
    global debug
    debug = args.debug
    if debug:
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)
    global skip_local
    skip_local = args.skip_local
    global scan_mode
    scan_mode = args.scan_mode
    global update_mode
    if args.update:
        update_mode = args.update

if __name__ == "__main__":
    # Command-line argument handling.
    keys_str = ",".join(function_map.keys())
    usage_examples = textwrap.dedent('''
        Examples:
            python3 ./fetch.py                     # refresh the listing pages and crawl newly added actors
            python3 ./fetch.py --update=4          # refresh the listing pages and crawl every record
            python3 ./fetch.py --cmd=actor_list    # refresh all actors from the listing pages (three languages)
            python3 ./fetch.py --cmd=actors        # crawl newly added actors
    ''')
    parser = argparse.ArgumentParser(
        description='fetch javhd data.\n\n' + usage_examples,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0,
                        help='0: only records with is_full_data=0 (default), 1: only is_full_data=1, 2: is_full_data<=1, 3: only is_full_data>1 (abnormal records), 4: all records')
    parser.add_argument('--scan_mode', type=int, choices=[0, 1, 2], default=1,
                        help='1: only uncensored makers/series/actors/movies (default), 0: the opposite, 2: everything')
    parser.add_argument('--skip_local', action='store_true', help='skip database operations when the page is already cached locally')
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    args = parser.parse_args()
    set_env(args)
    main(args.cmd, args)