modify scripts: rename --scan_mode to --uncensored, simplify the actor upsert, drop fetch_makers_list/fetch_series_list

oscarz
2025-06-25 08:43:05 +08:00
parent 7e14a5f247
commit 9cf521a0d6
2 changed files with 44 additions and 87 deletions

View File

@@ -208,17 +208,16 @@ class JavbusDBHandler(DatabaseHandler):
     def update_actor_detail(self, data, is_full_data=1):
         try:
             # Update the actors table
-            if data.get('avatar') is not None:
-                avatar = data.get('avatar', {})
-                avatar['href'] = data['href']
-                avatar['is_full_data'] = is_full_data
-                avatar_id = self.insert_or_update_common(avatar, self.tbl_name_actors, uniq_key='href')
-                logging.debug(f"update actor data. data: {avatar}")
-            else:
-                avatar_id = self.get_id_by_key(self.tbl_name_actors, 'href', data.get('href', ''))
+            avatar = data.get('avatar', {})
+            avatar['href'] = data['href']
+            avatar['is_full_data'] = is_full_data
+            avatar_id = self.insert_or_update_common(avatar, self.tbl_name_actors, uniq_key='href')
             if not avatar_id:
                 logging.warning(f"get actor id error. href: {data['href']}")
                 return None
+            else:
+                logging.debug(f"update actor data. href: {data['href']} avatar: {avatar}")

             # Update the movies table
             uncensored = data.get('uncensored', 0)
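The rewrite drops the branch on whether `avatar` is present and always upserts the actor row keyed on `href`, so `avatar_id` is resolved through a single path. `insert_or_update_common` itself is not part of this diff; the following is only a minimal sketch of the upsert behavior the call site appears to rely on, assuming a SQLite backend, an integer `id` primary key, and a UNIQUE constraint on the `uniq_key` column (all assumptions):

    import sqlite3

    def insert_or_update_common(conn, data, table, uniq_key='href'):
        # Upsert keyed on uniq_key: insert the row, or update every other
        # column when a row with the same uniq_key already exists.
        cols = list(data)
        placeholders = ', '.join('?' for _ in cols)
        updates = ', '.join(f"{c}=excluded.{c}" for c in cols if c != uniq_key)
        conn.execute(
            f"INSERT INTO {table} ({', '.join(cols)}) VALUES ({placeholders}) "
            f"ON CONFLICT({uniq_key}) DO UPDATE SET {updates}",
            [data[c] for c in cols])
        conn.commit()
        # Hand back a usable id either way, mirroring how the handler
        # expects avatar_id to be truthy after the call.
        row = conn.execute(f"SELECT id FROM {table} WHERE {uniq_key} = ?",
                           (data[uniq_key],)).fetchone()
        return row[0] if row else None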
@@ -393,11 +392,12 @@ class JavbusDBHandler(DatabaseHandler):
             logging.debug(f"insert one movie, id: {movie_id}, title: {movie['title']}, href: {movie['href']}")

             # Insert rows into the performers_movies relation table
+            uncensored = movie.get('uncensored', 0)
             for performer in movie.get('actors', []):
                 performer_id = self.get_id_by_key(self.tbl_name_actors, 'href', performer['href'])
                 # If the performer does not exist yet, insert it first
                 if performer_id is None:
-                    performer_id = self.insert_actor_index(performer['name'], performer['href'], from_movie_list=1)
+                    performer_id = self.insert_actor_index({'zh_name': performer['name'], 'href': performer['href']}, uncensored=uncensored, from_movie_list=1)
                     logging.debug(f"insert new performer. performer_id: {performer_id}, name: {performer['name']}")
                 if performer_id:
                     tmp_id = self.insert_actor_movie(performer_id, movie_id)
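The old call passed name and href positionally; the new one passes an index dict and threads the movie's `uncensored` flag through, so performers discovered from a movie listing inherit that flag. The method body is not shown in this diff; a hypothetical sketch of the shape the new call site implies, inside JavbusDBHandler:

    # Hypothetical sketch only: inferred from the new call site, not from
    # the real method body, which is outside this diff.
    def insert_actor_index(self, index_data, uncensored=0, from_movie_list=0):
        record = dict(index_data)              # e.g. {'zh_name': ..., 'href': ...}
        record['uncensored'] = uncensored      # inherited from the movie row
        record['from_movie_list'] = from_movie_list
        return self.insert_or_update_common(record, self.tbl_name_actors,
                                            uniq_key='href')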

View File

@@ -1,4 +1,3 @@
 import json
 import time
 import csv
@@ -19,7 +18,7 @@ scraper = craw.JavbusCrawler()
 debug = False
 skip_local = False
-scan_mode = 0
+g_uncensored = 0
 update_mode = 0

 # Fetch the performer list
@@ -32,7 +31,6 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
     s_url = f"/{lang}/actresses" if lang != 'zh' else f"/actresses"
     current_url = urljoin(scraper.host_url, s_url)
-    num = 1
     while current_url:
         logging.info(f"fetching url {current_url}")
         soup, status_code = scraper.fetch_page(current_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
@@ -65,69 +63,30 @@ def fetch_actor_list_lang(lang="en", uncensored=None):

 # Fetch the performer list
 def fetch_actor_list():
-    #for lang in ["en", "ja", "zh"]:
-    for lang in ['en']:
-        fetch_actor_list_lang(lang=lang, uncensored=1)
-
-    #for lang in ["en", "ja", "zh"]:
-    for lang in ['en']:
-        fetch_actor_list_lang(lang=lang)
-
-# Fetch the makers list
-def fetch_makers_list():
-    next_url = scraper.makers_uncensored_base_url
-    while next_url:
-        logging.info(f'fetching page {next_url}')
-        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
-            if list_data:
-                # Write each row to the database
-                for row in list_data:
-                    maker_id = db_tools.insert_or_update_makers(row, caller='list')
-                    if maker_id:
-                        logging.debug(f'insert maker to db. maker_id:{maker_id}, name: {row["name"]}, href:{row["href"]}')
-                    else:
-                        logging.warning(f'insert maker failed. name: {row["name"]}, href:{row["href"]}')
-            else:
-                logging.warning(f'fetch maker error. {next_url} ...')
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
-            break
-
-# Fetch the series list
-def fetch_series_list():
-    next_url = scraper.series_uncensored_base_url
-    while next_url:
-        logging.info(f'fetching page {next_url}')
-        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
-            if list_data:
-                # Write each row to the database
-                for row in list_data:
-                    series_id = db_tools.insert_or_update_series(row, caller='list')
-                    if series_id:
-                        logging.debug(f'insert series to db. series_id:{series_id}, name: {row["name"]}, href:{row["href"]}')
-                    else:
-                        logging.warning(f'insert series failed. name: {row["name"]}, href:{row["href"]}')
-            else:
-                logging.warning(f'fetch series error. {next_url} ...')
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
-            break
+    if g_uncensored == 1:
+        for lang in ["en", "ja", "zh"]:
+        #for lang in ['en']:
+            fetch_actor_list_lang(lang=lang, uncensored=1)
+    elif g_uncensored == 0:
+        for lang in ["en", "ja", "zh"]:
+        #for lang in ['en']:
+            fetch_actor_list_lang(lang=lang)
+    else:
+        for lang in ["en", "ja", "zh"]:
+        #for lang in ['en']:
+            fetch_actor_list_lang(lang=lang, uncensored=1)
+        for lang in ["en", "ja", "zh"]:
+        #for lang in ['en']:
+            fetch_actor_list_lang(lang=lang)

 # Update movie info for the makers on the list
 def fetch_movies_by_maker():
     if debug:
         url_list = db_tools.query_maker_hrefs(name='muramura')
     else:
-        if scan_mode == 1:
+        if g_uncensored == 1:
             url_list = db_tools.query_maker_hrefs(from_list=1)
-        elif scan_mode == 0:
+        elif g_uncensored == 0:
             url_list = db_tools.query_maker_hrefs(from_list=0)
         else:
             url_list = db_tools.query_maker_hrefs()
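The three branches of the new fetch_actor_list differ only in which uncensored values they pass (fetch_actor_list_lang defaults to uncensored=None). An equivalent, more compact sketch of the same dispatch:

    # Equivalent compact form of the new dispatch (sketch). Mode 1 crawls
    # uncensored only, mode 0 censored only (uncensored=None, the function
    # default), and any other value does both passes.
    def fetch_actor_list():
        passes = {1: [1], 0: [None]}.get(g_uncensored, [1, None])
        for uncensored in passes:
            for lang in ["en", "ja", "zh"]:
                fetch_actor_list_lang(lang=lang, uncensored=uncensored)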
@@ -166,9 +125,9 @@ def fetch_movies_by_series():
     if debug:
         url_list = db_tools.query_series_hrefs(name='10musume')
     else:
-        if scan_mode == 1:
+        if g_uncensored == 1:
             url_list = db_tools.query_series_hrefs(from_list=1)
-        elif scan_mode == 0:
+        elif g_uncensored == 0:
             url_list = db_tools.query_series_hrefs(from_list=0)
         else:
             url_list = db_tools.query_series_hrefs()
@@ -206,9 +165,9 @@ def fetch_movies_by_publishers():
     if debug:
         url_list = db_tools.query_publishers_hrefs(limit=1)
     else:
-        if scan_mode == 1:
+        if g_uncensored == 1:
             url_list = db_tools.query_publishers_hrefs(from_list=1)
-        elif scan_mode == 0:
+        elif g_uncensored == 0:
             url_list = db_tools.query_publishers_hrefs(from_list=0)
         else:
             url_list = db_tools.query_publishers_hrefs()
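fetch_movies_by_maker, fetch_movies_by_series, and fetch_movies_by_publishers now pick their URL worklist with an identical branch, so the three could share one helper; a sketch, with query_fn standing in for any of the db_tools.query_*_hrefs functions:

    # Sketch of the worklist selection the three fetch_movies_by_* functions
    # repeat; query_fn is any of db_tools.query_maker_hrefs,
    # query_series_hrefs, or query_publishers_hrefs.
    def select_hrefs(query_fn):
        if g_uncensored == 1:
            return query_fn(from_list=1)    # from_list=1 is treated as the uncensored set here
        elif g_uncensored == 0:
            return query_fn(from_list=0)
        return query_fn()                   # mode 2: no filter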
@@ -249,9 +208,9 @@ def fetch_performers_detail():
     abnormal_codes = [craw.http_code_404, craw.http_code_redirect]

     def get_performers(**kwargs):
-        if scan_mode == 1:
+        if g_uncensored == 1:
             kwargs["uncensored"] = 1
-        elif scan_mode == 0:
+        elif g_uncensored == 0:
             kwargs["uncensored"] = 0
         else:
             logging.debug(f"scan all records")
@@ -344,9 +303,9 @@ def fetch_movies_detail():
     abnormal_codes = [craw.http_code_404, craw.http_code_redirect]

     def get_movies(**kwargs):
-        if scan_mode == 1:
+        if g_uncensored == 1:
             kwargs["uncensored"] = 1
-        elif scan_mode == 0:
+        elif g_uncensored == 0:
             kwargs["uncensored"] = 0
         else:
             logging.debug(f"scan all records.")
@@ -417,8 +376,6 @@ def fetch_movies_detail():
 # Map shortcut names to functions
 function_map = {
     "actor_list": fetch_actor_list,
-    "maker_list": fetch_makers_list,
-    "series_list": fetch_series_list,
     "makers": fetch_movies_by_maker,
     "series": fetch_movies_by_series,
     "pub": fetch_movies_by_publishers,
@@ -471,8 +428,8 @@ def set_env(args):
     global skip_local
     skip_local = args.skip_local

-    global scan_mode
-    scan_mode = args.scan_mode
+    global g_uncensored
+    g_uncensored = args.uncensored

     global update_mode
     if args.update:
@@ -485,13 +442,13 @@ if __name__ == "__main__":
     usage_examples = textwrap.dedent('''
     Usage examples:
         python3 ./fetch.py                              # scan all newly added records
-        python3 ./fetch.py --scan_mode=1                # scan newly added uncensored records
-        python3 ./fetch.py --scan_mode=0                # scan newly added censored (non-uncensored) records
-        python3 ./fetch.py --scan_mode=2                # scan everything newly added
+        python3 ./fetch.py --uncensored=1               # scan newly added uncensored records
+        python3 ./fetch.py --uncensored=0               # scan newly added censored (non-uncensored) records
+        python3 ./fetch.py --uncensored=2               # scan everything newly added
         python3 ./fetch.py --update=4                   # scan the full set of records
-        python3 ./fetch.py --update=4 --scan_mode=1     # scan all uncensored records
-        python3 ./fetch.py --update=4 --scan_mode=0     # scan all censored (non-uncensored) records
-        python3 ./fetch.py --update=4 --scan_mode=2     # scan the full record set
+        python3 ./fetch.py --update=4 --uncensored=1    # scan all uncensored records
+        python3 ./fetch.py --update=4 --uncensored=0    # scan all censored (non-uncensored) records
+        python3 ./fetch.py --update=4 --uncensored=2    # scan the full record set
     ''')

     parser = argparse.ArgumentParser(
@@ -501,7 +458,7 @@ if __name__ == "__main__":
     #parser = argparse.ArgumentParser(description='fetch javdb data.')
     parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
     parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0 - only rows with is_full_data=0 (default), 1 - only is_full_data=1, 2 - rows with is_full_data<=1, 3 - only is_full_data>1 (abnormal rows), 4 - all rows')
-    parser.add_argument('--scan_mode', type=int, choices=[0, 1, 2], default=1, help='1 - only uncensored makers/series/actors/movies (default), 0 - the opposite, 2 - everything')
+    parser.add_argument('--uncensored', type=int, choices=[0, 1, 2], default=1, help='1 - only uncensored makers/series/actors/movies (default), 0 - the opposite, 2 - everything')
     parser.add_argument('--skip_local', action='store_true', help='skip database writes when the page is already cached locally')
     parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
     args = parser.parse_args()
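The diff does not show how args.cmd is consumed, but given function_map above, the dispatch presumably looks something like this sketch (an assumption, not code from the commit):

    # Assumed --cmd dispatch; the real loop is outside this diff.
    # e.g. python3 ./fetch.py --cmd=actor_list,makers --uncensored=1
    set_env(args)
    if args.cmd:
        for name in args.cmd.split(","):
            fn = function_map.get(name.strip())
            if fn is None:
                logging.warning(f"unknown cmd: {name}")
                continue
            fn()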