From 434c7459204e06cc3aebce834e1942e77e9484c8 Mon Sep 17 00:00:00 2001
From: oscarz
Date: Thu, 24 Apr 2025 17:24:13 +0800
Subject: [PATCH] modify scripts

---
 iafd/src/fetch.py        | 25 ++++++++--------
 iafd/src/iafd_scraper.py | 13 ++++++---
 javdb/src/fetch.py       | 61 ++++++++++++++++++++++++++++++----------
 javdb/src/scraper.py     | 16 +++++++----
 4 files changed, 78 insertions(+), 37 deletions(-)

diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py
index 001bab6..c8a4249 100644
--- a/iafd/src/fetch.py
+++ b/iafd/src/fetch.py
@@ -247,7 +247,7 @@ def fetch_performers_detail_once(perfomers_list):
         logging.debug(f"Fetching data for performer ({person}), url {url} ...")
         soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
         # File was loaded from the local cache, skip it
-        if skip_local and status_code == 99 :
+        if skip_local and status_code == scraper.http_code_local :
             last_performer_id = curr_id
             continue
         if soup:
@@ -272,11 +272,11 @@ def fetch_performers_detail_once(perfomers_list):
                 })
             else:
                 logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
-        elif status_code and status_code == 404:
-            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=2)
+        elif status_code and status_code == scraper.http_code_404:
+            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=scraper.http_code_404)
             logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skiping...')
-        elif status_code and status_code == 601:
-            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=3)
+        elif status_code and status_code == scraper.http_code_url:
+            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=scraper.http_code_url)
             logging.warning(f'601 page(wrong url). id: {performer_id}, name: {person}, url: {url}, Skiping...')
         else:
             logging.warning(f'fetch_page error. person: ({person}), url: {url}')
@@ -293,7 +293,7 @@ def fetch_performers_detail():
     # Get the list of new performers
     while True:
         if force: # walk through all records from the beginning
-            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
+            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, is_full_data_not_in=[scraper.http_code_404, scraper.http_code_url], order_by='id asc', limit=limit_count)
         else: # update only
             perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count)
         if len(perfomers_list) < 1:
@@ -322,7 +322,7 @@ def fetch_movies_detail():
     last_movie_id = 0
     while True:
         if force: # walk through all records from the beginning
-            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
+            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[scraper.http_code_404, scraper.http_code_url], order_by='id asc', limit=limit_count)
         else: # update only
             movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
         if len(movies_list) < 1:
@@ -336,8 +336,9 @@ def fetch_movies_detail():
             logging.debug(f"Fetching data for movie: {curr_id}: ({title}), url {url} ...")
             soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
             # File was loaded from the local cache, skip it
-            if skip_local and status_code == 99 :
+            if skip_local and status_code == scraper.http_code_local :
                 last_movie_id = curr_id
+                succ_count += 1
                 continue
             if soup:
                 movie_data = scraper.parse_page_movie(soup, url, title)
@@ -359,13 +360,13 @@ def fetch_movies_detail():
                     utils.write_movie_json(url, movie_data)
                 else:
                     logging.warning(f'parse_page_movie error. url: {url}')
-            elif status_code and status_code == 404:
+            elif status_code and status_code == scraper.http_code_404:
                 # mark as processed
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=2)
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
                 logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
-            elif status_code and status_code == 601:
+            elif status_code and status_code == scraper.http_code_url:
                 # mark as processed
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=3)
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_url)
                 logging.warning(f'601 page(wrong url). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
             else:
                 logging.warning(f'fetch_page error. url: {url}')
diff --git a/iafd/src/iafd_scraper.py b/iafd/src/iafd_scraper.py
index ef7a778..1ba861b 100644
--- a/iafd/src/iafd_scraper.py
+++ b/iafd/src/iafd_scraper.py
@@ -36,6 +36,11 @@ headers = {
 }
 
 scraper = cloudscraper.create_scraper()
 
+http_code_404 = 404
+http_code_login = 401
+http_code_url = 601
+http_code_local = 99
+
 save_raw_html = True
 load_from_local = True
@@ -49,27 +54,27 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
         soup = BeautifulSoup(html_text, parser)
         if validator(soup):
             # run the custom page check
-            return soup, 99 # return a code below 100 to show the page came from the local cache
+            return soup, http_code_local # return a code below 100 to show the page came from the local cache
 
     for attempt in range(max_retries):
         try:
             if host_url not in url.lower():
                 logging.error(f'wrong url format: {url}')
-                return None, 601
+                return None, http_code_url
 
             response = scraper.get(url, headers=headers)
 
             # handle HTTP status codes
             if response.status_code == 404:
                 logging.debug(f"Page not found (404): {url}")
-                return None, 404 # return 404 directly so the caller can skip this page
+                return None, http_code_404 # return 404 directly so the caller can skip this page
 
             response.raise_for_status() # handle HTTP errors
 
             # expired page, handled the same way as a 404
             if "invalid or outdated page" in response.text.lower():
                 logging.debug(f"invalid or outdated page: {url}")
-                return None, 404 # return 404 directly so the caller can skip this page
+                return None, http_code_404 # return 404 directly so the caller can skip this page
 
             if save_raw_html:
                 utils.write_raw_html(url, response.text)
diff --git a/javdb/src/fetch.py b/javdb/src/fetch.py
index cf4824e..61bfb6b 100644
--- a/javdb/src/fetch.py
+++ b/javdb/src/fetch.py
@@ -14,7 +14,9 @@ config.setup_logging()
 
 debug = False
 force = False
-skip_local = True
+skip_local = False
+from_actor = False
+abnormal_only = False
 
 # Get the performer list
 def fetch_actor_list():
@@ -152,10 +154,20 @@ def fetch_performers_detail():
     limit_count = 5 if debug else 100
     perfomers_list = []
     last_perfomer_id = 0
+    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
     while True:
         # fetch a batch from the database each time instead of loading everything at once
         if force: # walk through all records from the beginning
-            perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1)
+            if from_actor:
+                if abnormal_only:
+                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_in =abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
+                else:
+                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
+            else:
+                if abnormal_only:
+                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_in =abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
+                else:
+                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
         else: # update only
             perfomers_list = db_tools.query_actors(is_full_data=0, limit=limit_count)
         if len(perfomers_list) < 1:
@@ -182,13 +194,13 @@ def fetch_performers_detail():
                     alias = data.get('alias', [])
                     all_movies.extend(data.get('movies', []))
 
-            elif status_code and status_code == 404:
-                actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=2)
+            elif status_code and status_code == scraper.http_code_404:
+                actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_404)
                 logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skiping...')
                 need_insert = False
                 break
-            elif status_code and status_code == 401:
-                actor_id = db_tools.insert_or_update_movie_404(name=person, href=url, is_full_data=3)
+            elif status_code and status_code == scraper.http_code_login:
+                actor_id = db_tools.insert_or_update_movie_404(name=person, href=url, is_full_data=scraper.http_code_login)
                 logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skiping...')
                 need_insert = False
                 break
@@ -225,9 +237,19 @@ def fetch_movies_detail():
     limit_count = 10 if debug else 100
     movies_list = []
     last_movie_id = 0
+    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
     while True:
         if force: # walk through all records from the beginning
-            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1)
+            if from_actor:
+                if abnormal_only:
+                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_in =abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
+                else:
+                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
+            else:
+                if abnormal_only:
+                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_in =abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
+                else:
+                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
         else: # update only
             movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
         if len(movies_list) < 1:
@@ -241,8 +263,9 @@ def fetch_movies_detail():
             logging.debug(f"Fetching data for movie ({title}), url {url} ...")
             soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
             # File was loaded from the local cache, skip it
-            if skip_local and status_code == 99 :
+            if skip_local and status_code == scraper.http_code_local :
                 last_movie_id = curr_id
+                succ_count += 1
                 continue
             # Parse the page and write it to the database
             if soup:
@@ -258,11 +281,11 @@ def fetch_movies_detail():
                 else:
                     logging.warning(f'parse_page_movie error. url: {url}')
-            elif status_code and status_code == 404:
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=2)
+            elif status_code and status_code == scraper.http_code_404:
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
                 logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
-            elif status_code and status_code == 401:
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=3)
+            elif status_code and status_code == scraper.http_code_login:
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_login)
                 logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
             else:
                 logging.warning(f'fetch_page error. url: {url}')
 
@@ -285,12 +308,12 @@ function_map = {
 }
 
 # Main function
-def main(cmd, args_debug, args_force, args_skip_local):
+def main(cmd, args_debug, args_force, args_skip_local, args_from_actor, args_abnormal_only):
     global debug
     debug = args_debug
     if debug:
         logger = logging.getLogger()
-        #logger.setLevel(logging.DEBUG)
+        logger.setLevel(logging.DEBUG)
 
     global force
     force = args_force
@@ -298,6 +321,12 @@
     global skip_local
     skip_local = args_skip_local
 
+    global from_actor
+    from_actor = args_from_actor
+
+    global abnormal_only
+    abnormal_only = args_abnormal_only
+
     # Start the task
     task_id = db_tools.insert_task_log()
     if task_id is None:
@@ -339,6 +368,8 @@ if __name__ == "__main__":
     parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
     parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
     parser.add_argument('--skip_local', action='store_true', help='skip if cached html (true for skip)')
+    parser.add_argument('--from_actor', action='store_true', help='only iterate performers/movies that come from actor_list (effective in force mode)')
+    parser.add_argument('--abnormal_only', action='store_true', help='only iterate performers/movies with an abnormal URL (404, login required, etc.; effective in force mode)')
 
     args = parser.parse_args()
-    main(args.cmd, args.debug, args.force, args.skip_local)
+    main(args.cmd, args.debug, args.force, args.skip_local, args.from_actor, args.abnormal_only)
diff --git a/javdb/src/scraper.py b/javdb/src/scraper.py
index 84867fb..44c137f 100644
--- a/javdb/src/scraper.py
+++ b/javdb/src/scraper.py
@@ -25,6 +25,10 @@ headers = {
 }
 
 scraper = cloudscraper.create_scraper()
 
+http_code_404 = 404
+http_code_login = 401
+http_code_local = 99
+
 save_raw_html = True
 load_from_local = True
@@ -38,8 +42,8 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
         soup = BeautifulSoup(html_text, parser)
         if validator(soup):
             # run the custom page check
-            logging.info(f"read from local. href: {url}")
-            return soup, 99 # return a code below 100 to show the page came from the local cache
+            logging.debug(f"read from local. href: {url}")
+            return soup, http_code_local # return a code below 100 to show the page came from the local cache
 
     for attempt in range(max_retries):
         try:
@@ -51,8 +55,8 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
 
             # handle HTTP status codes
             if response.status_code == 404:
-                logging.warning(f"Page not found (404): {url}")
-                return None, 404 # return 404 directly so the caller can skip this page
+                logging.debug(f"Page not found (404): {url}")
+                return None, http_code_404 # return 404 directly so the caller can skip this page
 
             response.raise_for_status() # handle HTTP errors
 
@@ -62,8 +66,8 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
             soup = BeautifulSoup(response.text, parser)
             # check whether the page redirected to the login page
             if soup.find('nav', class_='panel form-panel'):
-                logging.warning(f"Page redirected to login page on {url}.")
-                return None, 401
+                logging.debug(f"Page redirected to login page on {url}.")
+                return None, http_code_login
 
             if save_raw_html:
                 utils.write_raw_html(url, response.text)