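"""Fetch actor, maker, series, and movie data from javdb and store it in a
local database via sqlite_utils. Which steps run is controlled by the --cmd
argument (see function_map below); runtime behaviour is tuned by the flag
arguments applied in set_env()."""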
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import scraper
import utils
config.setup_logging()
# Runtime flags, overwritten from command-line arguments in set_env().
debug = False          # limit record counts and enable debug logging
force = False          # walk everything from the start instead of only un-fetched rows
skip_local = False     # skip pages served from the local HTML cache
from_actor = False     # only walk entries that came from actor_list (with --force)
abnormal_only = False  # only walk abnormal URLs (404, login required; with --force)
fast_mode = False      # only walk uncensored makers and series
# Fetch the actor list.
def fetch_actor_list():
    next_url = scraper.actors_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
            if list_data:
                # Write the parsed rows to the database.
                for row in list_data:
                    actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
                    if actor_id:
                        logging.debug(f"insert performer index to db. performer_id: {actor_id}, name: {row['name']}, href: {row['href']}")
                    else:
                        logging.warning(f"insert performer index failed. name: {row['name']}, href: {row['href']}")
            else:
                logging.warning(f'parse actor list error. url: {next_url}')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break
        else:
            # Stop instead of retrying the same URL forever on unexpected errors.
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break
# Fetch the makers list.
def fetch_makers_list():
    next_url = scraper.makers_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
            if list_data:
                # Write the parsed rows to the database.
                for row in list_data:
                    maker_id = db_tools.insert_or_update_makers(row, caller='list')
                    if maker_id:
                        logging.debug(f"insert maker to db. maker_id: {maker_id}, name: {row['name']}, href: {row['href']}")
                    else:
                        logging.warning(f"insert maker failed. name: {row['name']}, href: {row['href']}")
            else:
                logging.warning(f'parse maker list error. url: {next_url}')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break
        else:
            # Stop instead of retrying the same URL forever on unexpected errors.
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break
# Fetch the series list.
def fetch_series_list():
    next_url = scraper.series_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
            if list_data:
                # Write the parsed rows to the database.
                for row in list_data:
                    series_id = db_tools.insert_or_update_series(row, caller='list')
                    if series_id:
                        logging.debug(f"insert series to db. series_id: {series_id}, name: {row['name']}, href: {row['href']}")
                    else:
                        logging.warning(f"insert series failed. name: {row['name']}, href: {row['href']}")
            else:
                logging.warning(f'parse series list error. url: {next_url}')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break
        else:
            # Stop instead of retrying the same URL forever on unexpected errors.
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break
# Update movie info from each maker in the makers list.
def fetch_movies_by_maker():
    if fast_mode:
        url_list = db_tools.query_maker_hrefs(from_list=1)
    else:
        url_list = db_tools.query_maker_hrefs()
    if debug:
        url_list = db_tools.query_maker_hrefs(name='muramura')
    for url in url_list:
        # Strip the downloadable marker from the URL query (if any).
        next_url = utils.remove_url_query(url)
        while next_url:
            logging.info(f"Fetching data for maker url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_maker_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1)
                        if tmp_id:
                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                        else:
                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f'parse_maker_detail error. url: {next_url}')
            elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break
            else:
                # Stop instead of retrying the same URL forever on unexpected errors.
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break
        # Break early when debugging.
        if debug:
            return True
# Update movie info from each series in the series list.
def fetch_movies_by_series():
    if fast_mode:
        url_list = db_tools.query_series_hrefs(from_list=1)
    else:
        url_list = db_tools.query_series_hrefs()
    if debug:
        url_list = db_tools.query_series_hrefs(name='10musume')
    for url in url_list:
        # Strip the downloadable marker from the URL query (if any).
        next_url = utils.remove_url_query(url)
        while next_url:
            logging.info(f"Fetching data for series url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_series_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1)
                        if tmp_id:
                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                        else:
                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f'parse_series_detail error. url: {next_url}')
            elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break
            else:
                # Stop instead of retrying the same URL forever on unexpected errors.
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break
        # Break early when debugging.
        if debug:
            return True
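# Note on the two detail fetchers below: with --force they walk rows in id
# order starting after the last successfully inserted id, visiting rows whose
# is_full_data status is outside the abnormal codes (or inside them when
# --abnormal_only is set); without --force they only visit rows that have
# never been fetched (is_full_data=0).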
# Update performer details.
def fetch_performers_detail():
    limit_count = 5 if debug else 100
    performers_list = []
    last_performer_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
    while True:
        # Take a batch from the database each round instead of loading everything at once.
        if force:  # walk every row from the start
            status_filter = {'is_full_data_in': abnormal_codes} if abnormal_only else {'is_full_data_not_in': abnormal_codes}
            performers_list = db_tools.query_actors(start_id=last_performer_id, order_by='id asc', limit=limit_count, from_actor_list=1 if from_actor else 0, **status_filter)
        else:  # only update rows that were never fetched
            performers_list = db_tools.query_actors(is_full_data=0, limit=limit_count)
        if len(performers_list) < 1:
            logging.info('all performers fetched.')
            break
        succ_rows = 0
        for performer in performers_list:
            url = performer['href']
            person = performer['name']
            pic = ''
            alias = []
            next_url = url
            all_movies = []
            need_insert = True
            while next_url:
                logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="span", identifier="actor-section-name", attr_type="class"))
                if soup:
                    data, next_url = scraper.parse_actor_detail(soup, next_url)
                    if data:
                        pic = data.get('pic', '')
                        alias = data.get('alias', [])
                        all_movies.extend(data.get('movies', []))
                elif status_code and status_code == scraper.http_code_404:
                    actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_404)
                    logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                    need_insert = False
                    break
                elif status_code and status_code == scraper.http_code_login:
                    actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_login)
                    logging.warning(f'401 page (login required). id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                    need_insert = False
                    break
                else:
                    # Unexpected fetch error; stop paging this actor.
                    logging.warning(f'fetch_page error. url: {url}')
                    break
            # 404 / 401 pages were already recorded above; skip the insert.
            if not need_insert:
                continue
            # All of this performer's movies collected; insert the record.
            performer_id = db_tools.insert_or_update_actor({
                'href': url,
                'name': person,
                'pic': pic,
                'alias': alias,
                'credits': all_movies
            })
            if performer_id:
                logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
                last_performer_id = performer_id
                succ_rows += 1
            else:
                logging.warning(f'insert person: ({person}) {url} failed.')
            time.sleep(0.5)
        logging.info(f'total request: {len(performers_list)}, succ: {succ_rows}, last performer id: {last_performer_id}')
        # Break early when debugging.
        if debug:
            return True
# Update movie details.
def fetch_movies_detail():
    limit_count = 10 if debug else 100
    movies_list = []
    last_movie_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
    while True:
        # Take a batch from the database each round.
        if force:  # walk every row from the start
            status_filter = {'is_full_data_in': abnormal_codes} if abnormal_only else {'is_full_data_not_in': abnormal_codes}
            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, order_by='id asc', limit=limit_count, from_actor_list=1 if from_actor else 0, **status_filter)
        else:  # only update rows that were never fetched
            movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
        if len(movies_list) < 1:
            logging.info('all movies fetched.')
            break
        succ_count = 0
        for movie in movies_list:
            url = movie['href']
            title = movie['title']
            curr_id = movie['id']
            logging.debug(f"Fetching data for movie ({title}), url {url} ...")
            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
            # Page was read from the local cache; skip it when requested.
            if skip_local and status_code == scraper.http_code_local:
                last_movie_id = curr_id
                succ_count += 1
                continue
            # Parse the page and write to the database.
            if soup:
                movie_data = scraper.parse_movie_detail(soup, url, title)
                if movie_data:
                    movie_id = db_tools.insert_or_update_movie(movie_data)
                    if movie_id:
                        logging.debug(f'insert one movie, id: {movie_id}, title: ({title}), url: {url}')
                        last_movie_id = movie_id
                        succ_count += 1
                    else:
                        logging.warning(f'insert movie {url} failed.')
                else:
                    logging.warning(f'parse_movie_detail error. url: {url}')
            elif status_code and status_code == scraper.http_code_404:
                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
                logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
            elif status_code and status_code == scraper.http_code_login:
                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_login)
                logging.warning(f'401 page (login required). id: {movie_id}, title: ({title}), url: {url}, Skipping...')
            else:
                logging.warning(f'fetch_page error. url: {url}')
            time.sleep(0.5)
        logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
        # Break early when debugging.
        if debug:
            return True
# Map command shortcuts to functions.
function_map = {
    "actor_list": fetch_actor_list,
    "maker_list": fetch_makers_list,
    "series_list": fetch_series_list,
    "makers": fetch_movies_by_maker,
    "series": fetch_movies_by_series,
    "movies": fetch_movies_detail,
    "actors": fetch_performers_detail,
}
# Main entry point.
def main(cmd):
    # Start a task record.
    task_id = db_tools.insert_task_log()
    if task_id is None:
        logging.warning('insert task log error.')
        return None
    logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, skip_local: {skip_local}, cmd: {cmd}')
    # Run the requested functions.
    if cmd:
        function_names = cmd.split(",")  # split the comma-separated input
        for short_name in function_names:
            func = function_map.get(short_name.strip())  # look up the function in the map
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {short_name}')
                func()
            else:
                logging.warning(f'{short_name} is not a valid function shortcut.')
    else:  # run everything
        for name, func in function_map.items():
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {name}')
                func()
            else:
                logging.warning(f'{name} is not a valid function shortcut.')
    logging.info('all processing completed!')
    db_tools.finalize_task_log(task_id)
# TODO:
# 1,
# Apply command-line arguments to the module-level flags.
def set_env(args):
    global debug, force, skip_local, from_actor, abnormal_only, fast_mode
    debug = args.debug
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)
    force = args.force
    skip_local = args.skip_local
    from_actor = args.from_actor
    abnormal_only = args.abnormal_only
    fast_mode = args.fast_mode
if __name__ == "__main__":
    # Command-line argument handling.
    keys_str = ",".join(function_map.keys())
    parser = argparse.ArgumentParser(description='fetch javdb data.')
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    parser.add_argument('--force', action='store_true', help='force update (true to rewrite all)')
    parser.add_argument('--skip_local', action='store_true', help='skip pages cached as local html (true to skip)')
    parser.add_argument('--from_actor', action='store_true', help='only walk actors or movies that came from actor_list (effective with --force)')
    parser.add_argument('--abnormal_only', action='store_true', help='only walk actors or movies with abnormal URLs (404, login required, etc.; effective with --force)')
    parser.add_argument('--fast_mode', action='store_true', help='only walk uncensored makers and series')
    args = parser.parse_args()
    set_env(args)
    main(args.cmd)
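# Example invocations (function shortcuts come from function_map above;
# database and scraper settings are assumed to be configured in config.py):
#   python fetch.py --cmd actor_list,actors --debug
#   python fetch.py --cmd movies --force --abnormal_only
#   python fetch.py --cmd makers,series --fast_mode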