import json
import time
import csv
import argparse
import textwrap
import logging
from functools import partial
from urllib.parse import urljoin, urlparse

import config
import sqlite_utils as db_tools
import scraper
import utils

config.setup_logging()
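
# Runtime flags, populated from the command line in set_env():
#   debug       - limit batch sizes and stop after a single pass
#   skip_local  - skip database writes when a page is already cached locally
#   scan_mode   - 1: uncensored-only (default), 0: the opposite, 2: everything
#   update_mode - which records fetch_performers_detail revisits (see --update)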
debug = False
skip_local = False
scan_mode = 0
update_mode = 0


# Fetch the actor list pages for one language
def fetch_actor_list_lang(lang="en"):
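    """Walk the paginated /<lang>/model listing and upsert each actor into the index."""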
s_url = f"/{lang}/model"
|
||
current_url = urljoin(scraper.host_url, s_url)
|
||
num = 1
|
||
while current_url:
|
||
logging.info(f"fetching url {current_url}")
|
||
data = scraper.fetch_post_page(current_url)
|
||
|
||
if not data:
|
||
logging.warning(f"fetch {current_url} error.")
|
||
break
|
||
|
||
# 检查 JSON 结构
|
||
if not all(key in data for key in ["status", "results_count", "pagination_params", "template"]):
|
||
logging.warning(f"[错误] 数据结构异常: {data}")
|
||
break
|
||
|
||
# 解析数据
|
||
all_data = scraper.parse_list_json(data, num=num, lang=lang)
|
||
|
||
# 插入到数据库
|
||
for row in all_data:
|
||
# 非en的话,只保留name
|
||
if lang != 'en':
|
||
new_row = {}
|
||
new_row['url'] = utils.replace_lang_param(row['url'])
|
||
new_row[f"{lang}_name"] = row[f"{lang}_name"]
|
||
insert_row = new_row
|
||
else:
|
||
insert_row = row
|
||
row_id = db_tools.insert_actor_index(insert_row)
|
||
if row_id:
|
||
logging.debug(f"insert or update one row. row id: {row_id}, data: {insert_row}")
|
||
else:
|
||
logging.warning(f"insert or update actor failed. data: {insert_row}")
|
||
|
||
# 获取下一页
|
||
next_path = data.get("pagination_params", {}).get("next")
|
||
if next_path:
|
||
current_url = urljoin(scraper.host_url, next_path)
|
||
logging.debug(f"next page: {current_url}")
|
||
num += 1
|
||
time.sleep(0.2)
|
||
else:
|
||
logging.info(f"all pages fetched. lang: {lang}")
|
||
break
|
||
|
||
# 调试break
|
||
if debug:
|
||
return True


# Fetch actor lists for all supported languages
def fetch_actor_list():
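    """Refresh the actor index in English, Japanese, and Chinese."""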
    for lang in ["en", "ja", "zh"]:
        fetch_actor_list_lang(lang=lang)


# Update actor details
def fetch_performers_detail():
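    """Fetch detail pages for actors in the index and store the results.

    Which rows are revisited is controlled by the global update_mode
    (see the --update flag): 0 - new rows only, 1 - complete rows only,
    2 - both, 3 - abnormal rows (404/login), 4 - everything.
    """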
    limit_count = 5 if debug else 100
    performers_list = []
    last_performer_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]

    def get_performers(**kwargs):
        kwargs["order_by"] = 'id asc'
        return db_tools.query_actors(limit=limit_count, **kwargs)

    while True:
        if update_mode == 0:    # only new records
            performers_list = get_performers(start_id=0, is_full_data=0)
        elif update_mode == 1:  # only complete records
            performers_list = get_performers(start_id=last_performer_id, is_full_data=1)
        elif update_mode == 2:  # 0 + 1
            performers_list = get_performers(start_id=last_performer_id, is_full_data_not_in=abnormal_codes)
        elif update_mode == 3:  # other (abnormal) records
            performers_list = get_performers(start_id=last_performer_id, is_full_data_in=abnormal_codes)
        else:                   # everything
            performers_list = get_performers(start_id=last_performer_id)

        if len(performers_list) < 1:
            logging.info('all performers fetched.')
            break

        succ_rows = 0
        for performer in performers_list:
            url = performer['url']
            person = performer['name']

            # An actor's detail can span multiple pages; parse_actor_detail
            # returns the url of the next page until there is none.
            next_url = url
            need_insert = True
            while next_url:
                logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="info__features", attr_type="class"))
                if soup:
                    data, next_url = scraper.parse_actor_detail(soup, next_url)
                    if data:
                        # All of this person's movies have been fetched; start inserting
                        performer_id = db_tools.update_actor_detail(data, is_full_data=1)
                        if performer_id:
                            logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
                            last_performer_id = performer_id
                            succ_rows += 1
                        else:
                            logging.warning(f'insert person: ({person}) {url} failed.')

                elif status_code and status_code == scraper.http_code_404:
                    actor_id = db_tools.update_actor_detail({'url': url}, is_full_data=scraper.http_code_404)
                    logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, skipping...')
                    need_insert = False
                    break
                elif status_code and status_code == scraper.http_code_login:
                    actor_id = db_tools.update_actor_detail({'url': url}, is_full_data=scraper.http_code_login)
                    logging.warning(f'401 page (login required). id: {actor_id}, name: ({person}), url: {url}, skipping...')
                    need_insert = False
                    break
                else:
                    logging.warning(f'fetch_page error. url: {url}')

            # A 401 or 404 has already been recorded above; skip to the next performer
            if not need_insert:
                continue
            time.sleep(0.5)

        logging.info(f'total request: {len(performers_list)}, succ: {succ_rows}, last performer id: {last_performer_id}')
        # Debug: stop after one batch
        if debug:
            return True


# Map command-line shortcuts to functions
function_map = {
    "actor_list": fetch_actor_list,
    "actors": fetch_performers_detail,
}


# Main entry point
def main(cmd, args):
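    """Run the functions named in cmd (comma-separated); run all of them when cmd is empty."""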
    # Run the requested functions
    if cmd:
        function_names = cmd.split(",")  # split the comma-separated input
        for short_name in function_names:
            func = function_map.get(short_name.strip())  # look up the mapped function
            if callable(func):
                func()
            else:
                logging.warning(f"{short_name} is not a valid function shortcut.")
    else:  # run everything
        for name, func in function_map.items():
            if callable(func):
                func()
            else:
                logging.warning(f"{name} is not a valid function shortcut.")

    logging.info('all process completed!')


# TODO:
# 1,


# Set runtime flags from command-line arguments
def set_env(args):
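    """Copy parsed arguments into the module-level mode globals."""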
    global debug
    debug = args.debug
    if debug:
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)

    global skip_local
    skip_local = args.skip_local

    global scan_mode
    scan_mode = args.scan_mode

    global update_mode
    if args.update:
        update_mode = args.update


if __name__ == "__main__":
    # Command-line argument handling
    keys_str = ",".join(function_map.keys())

    usage_examples = textwrap.dedent('''
        Usage examples:
          python3 ./fetch.py                    # refresh the list pages, then fetch newly added actors
          python3 ./fetch.py --update=4         # refresh the list pages, then fetch all records
          python3 ./fetch.py --cmd=actor_list   # refresh all actors on the list pages (three languages)
          python3 ./fetch.py --cmd=actors       # fetch newly added actors
    ''')

    parser = argparse.ArgumentParser(
        description='fetch javhd data.\n\n' + usage_examples,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0 - only is_full_data=0 (default), 1 - only is_full_data=1, 2 - is_full_data<=1, 3 - only is_full_data>1 (abnormal records), 4 - everything')
    parser.add_argument('--scan_mode', type=int, choices=[0, 1, 2], default=1, help='1 - only uncensored makers/series/actors/movies (default), 0 - the opposite, 2 - everything')
    parser.add_argument('--skip_local', action='store_true', help='skip database writes when the page is already cached locally')
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    args = parser.parse_args()

    set_env(args)
    main(args.cmd, args)