From 9cf521a0d654cdb2deab9e4b16acc86c5ff42fe4 Mon Sep 17 00:00:00 2001 From: oscarz Date: Wed, 25 Jun 2025 08:43:05 +0800 Subject: [PATCH] modify scripts --- src/db_utils/sqlite_db.py | 18 +++--- src/javbus/fetch.py | 113 ++++++++++++-------------------------- 2 files changed, 44 insertions(+), 87 deletions(-) diff --git a/src/db_utils/sqlite_db.py b/src/db_utils/sqlite_db.py index 8670459..fb66c8b 100644 --- a/src/db_utils/sqlite_db.py +++ b/src/db_utils/sqlite_db.py @@ -208,17 +208,16 @@ class JavbusDBHandler(DatabaseHandler): def update_actor_detail(self, data, is_full_data=1): try: # 跟新actor表 - if data.get('avatar') is not None: - avatar = data.get('avatar', {}) - avatar['href'] = data['href'] - avatar['is_full_data'] = is_full_data - avatar_id = self.insert_or_update_common(avatar, self.tbl_name_actors, uniq_key='href') - logging.debug(f"update actor data. data: {avatar}") - else: - avatar_id = self.get_id_by_key(self.tbl_name_actors, 'href', data.get('href', '')) + avatar = data.get('avatar', {}) + avatar['href'] = data['href'] + avatar['is_full_data'] = is_full_data + + avatar_id = self.insert_or_update_common(avatar, self.tbl_name_actors, uniq_key='href') if not avatar_id: logging.warning(f"get actor id error. href: {data['href']}") return None + else: + logging.debug(f"update actor data. href: {data['href']} avatar: {avatar}") # 更新movies表 uncensored = data.get('uncensored', 0) @@ -393,11 +392,12 @@ class JavbusDBHandler(DatabaseHandler): logging.debug(f"insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}") # 插入 performers_movies 关系表 + uncensored = movie.get('uncensored', 0) for performer in movie.get('actors', []): performer_id = self.get_id_by_key(self.tbl_name_actors, 'href', performer['href']) # 如果演员不存在,先插入 if performer_id is None: - performer_id = self.insert_actor_index(performer['name'], performer['href'], from_movie_list=1) + performer_id = self.insert_actor_index({'zh_name': performer['name'], 'href':performer['href']}, uncensored=uncensored, from_movie_list=1) logging.debug(f"insert new perfomer. perfomer_id: {performer_id}, name:{performer['name']}") if performer_id: tmp_id = self.insert_actor_movie(performer_id, movie_id) diff --git a/src/javbus/fetch.py b/src/javbus/fetch.py index c71fcdb..de1ac26 100644 --- a/src/javbus/fetch.py +++ b/src/javbus/fetch.py @@ -1,4 +1,3 @@ - import json import time import csv @@ -19,7 +18,7 @@ scraper = craw.JavbusCrawler() debug = False skip_local = False -scan_mode = 0 +g_uncensored = 0 update_mode = 0 # 获取演员列表 @@ -32,7 +31,6 @@ def fetch_actor_list_lang(lang="en", uncensored=None): s_url = f"/{lang}/actresses" if lang != 'zh' else f"/actresses" current_url = urljoin(scraper.host_url, s_url) - num = 1 while current_url: logging.info(f"fetching url {current_url}") soup, status_code = scraper.fetch_page(current_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id")) @@ -65,69 +63,30 @@ def fetch_actor_list_lang(lang="en", uncensored=None): # 获取演员列表 def fetch_actor_list(): - #for lang in ["en", "ja", "zh"]: - for lang in ['en']: - fetch_actor_list_lang(lang=lang, uncensored=1) - - #for lang in ["en", "ja", "zh"]: - for lang in ['en']: - fetch_actor_list_lang(lang=lang) - -# 获取makers列表 -def fetch_makers_list(): - next_url = scraper.makers_uncensored_base_url - while next_url: - logging.info(f'fetching page {next_url}') - soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id")) - if soup: - list_data, next_url = scraper.parse_makers_uncensored(soup, next_url) - if list_data : - # 写入数据库 - for row in list_data: - maker_id = db_tools.insert_or_update_makers(row, caller='list') - if maker_id: - logging.debug(f'insert maker to db. maker_id:{maker_id}, name: {row['name']}, href:{row['href']}') - else: - logging.warning(f'insert maker failed. name: {row['name']}, href:{row['href']}') - else: - logging.warning(f'fetch actor error. {next_url} ...') - - elif status_code and status_code == 404: - logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}') - break - -# 获取series列表 -def fetch_series_list(): - next_url = scraper.series_uncensored_base_url - while next_url: - logging.info(f'fetching page {next_url}') - soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id")) - if soup: - list_data, next_url = scraper.parse_series_uncensored(soup, next_url) - if list_data : - # 写入数据库 - for row in list_data: - maker_id = db_tools.insert_or_update_series(row, caller='list') - if maker_id: - logging.debug(f'insert series to db. maker_id:{maker_id}, name: {row['name']}, href:{row['href']}') - else: - logging.warning(f'insert series failed. name: {row['name']}, href:{row['href']}') - else: - logging.warning(f'fetch actor error. {next_url} ...') - - elif status_code and status_code == 404: - logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}') - break - + if g_uncensored == 1: + for lang in ["en", "ja", "zh"]: + #for lang in ['en']: + fetch_actor_list_lang(lang=lang, uncensored=1) + elif g_uncensored ==0: + for lang in ["en", "ja", "zh"]: + #for lang in ['en']: + fetch_actor_list_lang(lang=lang) + else: + for lang in ["en", "ja", "zh"]: + #for lang in ['en']: + fetch_actor_list_lang(lang=lang, uncensored=1) + for lang in ["en", "ja", "zh"]: + #for lang in ['en']: + fetch_actor_list_lang(lang=lang) # 更新makers列表中的影片信息 def fetch_movies_by_maker(): if debug: url_list = db_tools.query_maker_hrefs(name='muramura') else: - if scan_mode==1: + if g_uncensored==1: url_list = db_tools.query_maker_hrefs(from_list=1) - elif scan_mode==0: + elif g_uncensored==0: url_list = db_tools.query_maker_hrefs(from_list=0) else: url_list = db_tools.query_maker_hrefs() @@ -166,9 +125,9 @@ def fetch_movies_by_series(): if debug: url_list = db_tools.query_series_hrefs(name='10musume') else: - if scan_mode == 1: + if g_uncensored == 1: url_list = db_tools.query_series_hrefs(from_list=1) - elif scan_mode == 0: + elif g_uncensored == 0: url_list = db_tools.query_series_hrefs(from_list=0) else: url_list = db_tools.query_series_hrefs() @@ -206,9 +165,9 @@ def fetch_movies_by_publishers(): if debug: url_list = db_tools.query_publishers_hrefs(limit=1) else: - if scan_mode == 1: + if g_uncensored == 1: url_list = db_tools.query_publishers_hrefs(from_list=1) - elif scan_mode == 0: + elif g_uncensored == 0: url_list = db_tools.query_publishers_hrefs(from_list=0) else: url_list = db_tools.query_publishers_hrefs() @@ -249,9 +208,9 @@ def fetch_performers_detail(): abnormal_codes = [craw.http_code_404, craw.http_code_redirect] def get_performers(**kwargs): - if scan_mode == 1: + if g_uncensored == 1: kwargs["uncensored"] = 1 - elif scan_mode == 0: + elif g_uncensored == 0: kwargs["uncensored"] = 0 else: logging.debug(f"scan all records") @@ -344,9 +303,9 @@ def fetch_movies_detail(): abnormal_codes = [craw.http_code_404, craw.http_code_redirect] def get_movies(**kwargs): - if scan_mode == 1: + if g_uncensored == 1: kwargs["uncensored"] = 1 - elif scan_mode == 0: + elif g_uncensored == 0: kwargs["uncensored"] = 0 else: logging.debug(f"scan all records.") @@ -417,8 +376,6 @@ def fetch_movies_detail(): # 建立缩写到函数的映射 function_map = { "actor_list": fetch_actor_list, - "maker_list": fetch_makers_list, - "series_list": fetch_series_list, "makers": fetch_movies_by_maker, "series" : fetch_movies_by_series, "pub" : fetch_movies_by_publishers, @@ -471,8 +428,8 @@ def set_env(args): global skip_local skip_local = args.skip_local - global scan_mode - scan_mode = args.scan_mode + global g_uncensored + g_uncensored = args.uncensored global update_mode if args.update: @@ -485,13 +442,13 @@ if __name__ == "__main__": usage_examples = textwrap.dedent(''' 示例用法: python3 ./fetch.py # 遍历新增的所有记录 - python3 ./fetch.py --scan_mode=1 # 遍历新增的 uncensored 记录(无码片) - python3 ./fetch.py --scan_mode=0 # 遍历新增的 非uncensored 记录(有码片) - python3 ./fetch.py --scan_mode=2 # 遍历所有新增 + python3 ./fetch.py --uncensored=1 # 遍历新增的 uncensored 记录(无码片) + python3 ./fetch.py --uncensored=0 # 遍历新增的 非uncensored 记录(有码片) + python3 ./fetch.py --uncensored=2 # 遍历所有新增 python3 ./fetch.py --update=4 # 遍历全量的记录 - python3 ./fetch.py --update=4 --scan_mode=1 # 遍历全量的 uncensored 记录(无码片) - python3 ./fetch.py --update=4 --scan_mode=0 # 遍历全量的 非uncensored 记录(有码片) - python3 ./fetch.py --update=4 --scan_mode=2 # 遍历全量记录 + python3 ./fetch.py --update=4 --uncensored=1 # 遍历全量的 uncensored 记录(无码片) + python3 ./fetch.py --update=4 --uncensored=0 # 遍历全量的 非uncensored 记录(有码片) + python3 ./fetch.py --update=4 --uncensored=2 # 遍历全量记录 ''') parser = argparse.ArgumentParser( @@ -501,7 +458,7 @@ if __name__ == "__main__": #parser = argparse.ArgumentParser(description='fetch javdb data.') parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}") parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0-只遍历is_full_data=0(默认), 1-只遍历is_full_data=1, 2-遍历is_full_data<=1, 3-只遍历is_full_data>1(异常数据), 4-遍历所有') - parser.add_argument('--scan_mode', type=int, choices=[0, 1, 2], default=1, help='1-只遍历所有 uncensored 的 makers/series/actors/movies(默认), 0-与前者相反, 2-全量') + parser.add_argument('--uncensored', type=int, choices=[0, 1, 2], default=1, help='1-只遍历所有 uncensored 的 makers/series/actors/movies(默认), 0-与前者相反, 2-全量') parser.add_argument('--skip_local', action='store_true', help='如果本地缓存了页面,则跳过数据库操作') parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)') args = parser.parse_args()