modify scripts

oscarz
2025-06-25 08:43:05 +08:00
parent 7e14a5f247
commit 9cf521a0d6
2 changed files with 44 additions and 87 deletions


@@ -1,4 +1,3 @@
import json
import time
import csv
@@ -19,7 +18,7 @@ scraper = craw.JavbusCrawler()
debug = False
skip_local = False
scan_mode = 0
g_uncensored = 0
update_mode = 0
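# module-level switches; set_env() later overrides these from the parsed CLI arguments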
# Fetch the actor list
@@ -32,7 +31,6 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
s_url = f"/{lang}/actresses" if lang != 'zh' else "/actresses"
current_url = urljoin(scraper.host_url, s_url)
num = 1
while current_url:
logging.info(f"fetching url {current_url}")
soup, status_code = scraper.fetch_page(current_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
@@ -65,69 +63,30 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
# Fetch the actor list
def fetch_actor_list():
#for lang in ["en", "ja", "zh"]:
for lang in ['en']:
fetch_actor_list_lang(lang=lang, uncensored=1)
#for lang in ["en", "ja", "zh"]:
for lang in ['en']:
fetch_actor_list_lang(lang=lang)
# Fetch the makers list
def fetch_makers_list():
next_url = scraper.makers_uncensored_base_url
while next_url:
logging.info(f'fetching page {next_url}')
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
if soup:
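# parse_makers_uncensored returns the rows on this page and the next page URL (falsy when there are no more pages)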
list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
if list_data:
# Write the parsed rows to the database
for row in list_data:
maker_id = db_tools.insert_or_update_makers(row, caller='list')
if maker_id:
logging.debug(f"insert maker to db. maker_id:{maker_id}, name: {row['name']}, href:{row['href']}")
else:
logging.warning(f"insert maker failed. name: {row['name']}, href:{row['href']}")
else:
logging.warning(f'fetch makers error. {next_url} ...')
elif status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
break
# Fetch the series list
def fetch_series_list():
next_url = scraper.series_uncensored_base_url
while next_url:
logging.info(f'fetching page {next_url}')
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
if list_data:
# Write the parsed rows to the database
for row in list_data:
series_id = db_tools.insert_or_update_series(row, caller='list')
if series_id:
logging.debug(f"insert series to db. series_id:{series_id}, name: {row['name']}, href:{row['href']}")
else:
logging.warning(f"insert series failed. name: {row['name']}, href:{row['href']}")
else:
logging.warning(f'fetch series error. {next_url} ...')
elif status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
break
if g_uncensored == 1:
for lang in ["en", "ja", "zh"]:
#for lang in ['en']:
fetch_actor_list_lang(lang=lang, uncensored=1)
elif g_uncensored == 0:
for lang in ["en", "ja", "zh"]:
#for lang in ['en']:
fetch_actor_list_lang(lang=lang)
else:
for lang in ["en", "ja", "zh"]:
#for lang in ['en']:
fetch_actor_list_lang(lang=lang, uncensored=1)
for lang in ["en", "ja", "zh"]:
#for lang in ['en']:
fetch_actor_list_lang(lang=lang)
# Update movie info for the makers in the list
def fetch_movies_by_maker():
if debug:
url_list = db_tools.query_maker_hrefs(name='muramura')
else:
if scan_mode == 1:
if g_uncensored == 1:
url_list = db_tools.query_maker_hrefs(from_list=1)
elif scan_mode == 0:
elif g_uncensored == 0:
url_list = db_tools.query_maker_hrefs(from_list=0)
else:
url_list = db_tools.query_maker_hrefs()
@@ -166,9 +125,9 @@ def fetch_movies_by_series():
if debug:
url_list = db_tools.query_series_hrefs(name='10musume')
else:
if scan_mode == 1:
if g_uncensored == 1:
url_list = db_tools.query_series_hrefs(from_list=1)
elif scan_mode == 0:
elif g_uncensored == 0:
url_list = db_tools.query_series_hrefs(from_list=0)
else:
url_list = db_tools.query_series_hrefs()
@@ -206,9 +165,9 @@ def fetch_movies_by_publishers():
if debug:
url_list = db_tools.query_publishers_hrefs(limit=1)
else:
if scan_mode == 1:
if g_uncensored == 1:
url_list = db_tools.query_publishers_hrefs(from_list=1)
elif scan_mode == 0:
elif g_uncensored == 0:
url_list = db_tools.query_publishers_hrefs(from_list=0)
else:
url_list = db_tools.query_publishers_hrefs()
@@ -249,9 +208,9 @@ def fetch_performers_detail():
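# status codes treated as abnormal for a performer page: not found (404) or redirected away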
abnormal_codes = [craw.http_code_404, craw.http_code_redirect]
def get_performers(**kwargs):
if scan_mode == 1:
if g_uncensored == 1:
kwargs["uncensored"] = 1
elif scan_mode == 0:
elif g_uncensored == 0:
kwargs["uncensored"] = 0
else:
logging.debug(f"scan all records")
@@ -344,9 +303,9 @@ def fetch_movies_detail():
abnormal_codes = [craw.http_code_404, craw.http_code_redirect]
def get_movies(**kwargs):
if scan_mode == 1:
if g_uncensored == 1:
kwargs["uncensored"] = 1
elif scan_mode == 0:
elif g_uncensored == 0:
kwargs["uncensored"] = 0
else:
logging.debug(f"scan all records.")
@@ -417,8 +376,6 @@ def fetch_movies_detail():
# Map command shortcuts to their functions
function_map = {
"actor_list": fetch_actor_list,
"maker_list": fetch_makers_list,
"series_list": fetch_series_list,
"makers": fetch_movies_by_maker,
"series" : fetch_movies_by_series,
"pub" : fetch_movies_by_publishers,
@@ -471,8 +428,8 @@ def set_env(args):
global skip_local
skip_local = args.skip_local
global scan_mode
scan_mode = args.scan_mode
global g_uncensored
g_uncensored = args.uncensored
global update_mode
if args.update:
@@ -485,13 +442,13 @@ if __name__ == "__main__":
usage_examples = textwrap.dedent('''
Example usage:
python3 ./fetch.py # scan all newly added records
python3 ./fetch.py --scan_mode=1 # scan newly added uncensored records
python3 ./fetch.py --scan_mode=0 # scan newly added censored (non-uncensored) records
python3 ./fetch.py --scan_mode=2 # scan all newly added records
python3 ./fetch.py --uncensored=1 # scan newly added uncensored records
python3 ./fetch.py --uncensored=0 # scan newly added censored (non-uncensored) records
python3 ./fetch.py --uncensored=2 # scan all newly added records
python3 ./fetch.py --update=4 # scan the full set of records
python3 ./fetch.py --update=4 --scan_mode=1 # full scan of uncensored records
python3 ./fetch.py --update=4 --scan_mode=0 # full scan of censored (non-uncensored) records
python3 ./fetch.py --update=4 --scan_mode=2 # full scan of all records
python3 ./fetch.py --update=4 --uncensored=1 # full scan of uncensored records
python3 ./fetch.py --update=4 --uncensored=0 # full scan of censored (non-uncensored) records
python3 ./fetch.py --update=4 --uncensored=2 # full scan of all records
''')
parser = argparse.ArgumentParser(
@@ -501,7 +458,7 @@ if __name__ == "__main__":
#parser = argparse.ArgumentParser(description='fetch javdb data.')
parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0 - scan only is_full_data=0 (default), 1 - scan only is_full_data=1, 2 - scan is_full_data<=1, 3 - scan only is_full_data>1 (abnormal data), 4 - scan everything')
parser.add_argument('--scan_mode', type=int, choices=[0, 1, 2], default=1, help='1 - scan only uncensored makers/series/actors/movies (default), 0 - the opposite, 2 - scan everything')
parser.add_argument('--uncensored', type=int, choices=[0, 1, 2], default=1, help='1 - scan only uncensored makers/series/actors/movies (default), 0 - the opposite, 2 - scan everything')
parser.add_argument('--skip_local', action='store_true', help='skip database operations when the page is already cached locally')
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
args = parser.parse_args()
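
For reference, a minimal sketch of how the comma-separated --cmd shortcuts could be dispatched through function_map after parsing; the actual dispatch code sits below this hunk, so the set_env() ordering and the unknown-key fallback here are assumptions:

set_env(args)  # assumption: populate the module-level switches before dispatch
if args.cmd:
    for key in args.cmd.split(','):
        func = function_map.get(key.strip())
        if func:
            func()
        else:
            logging.warning(f'unknown cmd shortcut: {key}')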