import json
import time
import csv
import argparse
import logging
from functools import partial

import config
import sqlite_utils as db_tools
import scraper
import utils

config.setup_logging()

debug = False
skip_local = False
scan_mode = 0
update_mode = 0


# Fetch the actor index list
def fetch_actor_list():
    next_url = scraper.actors_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
            if list_data:
                # Write the parsed rows to the database
                for row in list_data:
                    actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
                    if actor_id:
                        logging.debug(f'insert performer index to db. performer_id: {actor_id}, name: {row["name"]}, href: {row["href"]}')
                    else:
                        logging.warning(f'insert performer index failed. name: {row["name"]}, href: {row["href"]}')
            else:
                logging.warning(f'fetch actor error. {next_url} ...')
        elif status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break


# Fetch the makers index list
def fetch_makers_list():
    next_url = scraper.makers_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
            if list_data:
                # Write the parsed rows to the database
                for row in list_data:
                    maker_id = db_tools.insert_or_update_makers(row, caller='list')
                    if maker_id:
                        logging.debug(f'insert maker to db. maker_id: {maker_id}, name: {row["name"]}, href: {row["href"]}')
                    else:
                        logging.warning(f'insert maker failed. name: {row["name"]}, href: {row["href"]}')
            else:
                logging.warning(f'fetch maker error. {next_url} ...')
        elif status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break


# Fetch the series index list
def fetch_series_list():
    next_url = scraper.series_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
            if list_data:
                # Write the parsed rows to the database
                for row in list_data:
                    series_id = db_tools.insert_or_update_series(row, caller='list')
                    if series_id:
                        logging.debug(f'insert series to db. series_id: {series_id}, name: {row["name"]}, href: {row["href"]}')
                    else:
                        logging.warning(f'insert series failed. name: {row["name"]}, href: {row["href"]}')
            else:
                logging.warning(f'fetch series error. {next_url} ...')
        elif status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break

# Update the movie index for the makers list
def fetch_movies_by_maker():
    if debug:
        url_list = db_tools.query_maker_hrefs(name='muramura')
    else:
        if scan_mode == 1:
            url_list = db_tools.query_maker_hrefs(from_list=1)
        elif scan_mode == 0:
            url_list = db_tools.query_maker_hrefs(from_list=0)
        else:
            url_list = db_tools.query_maker_hrefs()

    for row in url_list:
        url = row['href']
        row_id = row['id']
        uncensored = row['from_list'] if row['from_list'] > 0 else None
        # Strip the downloadable flag from the URL query, if present
        next_url = utils.remove_url_query(url)
        while next_url:
            logging.info(f"Fetching data for maker url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_maker_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1, maker_id=row_id, uncensored=uncensored)
                        if tmp_id:
                            logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                        else:
                            logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
                else:
                    logging.warning(f'parse_page_movie error. url: {next_url}')
            elif status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break
        # Stop early in debug mode
        if debug:
            return True


# Update the movie index for the series list
def fetch_movies_by_series():
    if debug:
        url_list = db_tools.query_series_hrefs(name='10musume')
    else:
        if scan_mode == 1:
            url_list = db_tools.query_series_hrefs(from_list=1)
        elif scan_mode == 0:
            url_list = db_tools.query_series_hrefs(from_list=0)
        else:
            url_list = db_tools.query_series_hrefs()

    for row in url_list:
        url = row['href']
        row_id = row['id']
        uncensored = row['from_list'] if row['from_list'] > 0 else None
        # Strip the downloadable flag from the URL query, if present
        next_url = utils.remove_url_query(url)
        while next_url:
            logging.info(f"Fetching data for series url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_series_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1, series_id=row_id, uncensored=uncensored)
                        if tmp_id:
                            logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                        else:
                            logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
                else:
                    logging.warning(f'parse_page_movie error. url: {next_url}')
            elif status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break
        # Stop early in debug mode
        if debug:
            return True

# Update the movie index for the publishers list
def fetch_movies_by_publishers():
    if debug:
        url_list = db_tools.query_publishers_hrefs(limit=1)
    else:
        if scan_mode == 1:
            url_list = db_tools.query_publishers_hrefs(from_list=1)
        elif scan_mode == 0:
            url_list = db_tools.query_publishers_hrefs(from_list=0)
        else:
            url_list = db_tools.query_publishers_hrefs()

    for row in url_list:
        url = row['href']
        row_id = row['id']
        # Strip the downloadable flag from the URL query, if present
        next_url = utils.remove_url_query(url)
        while next_url:
            logging.info(f"Fetching data for publisher url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="modal-card", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_publisher_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_publishers=1, pub_id=row_id)
                        if tmp_id:
                            logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                        else:
                            logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
                else:
                    logging.warning(f'parse_page_movie error. url: {next_url}')
            elif status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break
        # Stop early in debug mode
        if debug:
            return True


# Update performer (actor) details
def fetch_performers_detail():
    limit_count = 5 if debug else 100
    performers_list = []
    last_performer_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]

    def get_performers(**kwargs):
        if scan_mode == 1:
            kwargs["from_actor_list"] = 1
        elif scan_mode == 0:
            kwargs["from_actor_list"] = 0
        else:
            logging.debug(f"scan all records")
        kwargs["order_by"] = 'id asc'
        return db_tools.query_actors(limit=limit_count, **kwargs)

    while True:
        if update_mode == 0:    # only new records
            performers_list = get_performers(start_id=0, is_full_data=0)
        elif update_mode == 1:  # only completed records
            performers_list = get_performers(start_id=last_performer_id, is_full_data=1)
        elif update_mode == 2:  # 0 + 1
            performers_list = get_performers(start_id=last_performer_id, is_full_data_not_in=abnormal_codes)
        elif update_mode == 3:  # abnormal records only
            performers_list = get_performers(start_id=last_performer_id, is_full_data_in=abnormal_codes)
        else:                   # everything
            performers_list = get_performers(start_id=last_performer_id)

        if len(performers_list) < 1:
            logging.info(f'all performers fetched.')
            break

        succ_rows = 0
        for performer in performers_list:
            url = performer['href']
            person = performer['name']
            pic = ''
            alias = []
            next_url = url
            all_movies = []
            need_insert = True
            while next_url:
                logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="span", identifier="actor-section-name", attr_type="class"))
                if soup:
                    data, next_url = scraper.parse_actor_detail(soup, next_url)
                    if data:
                        pic = data.get('pic', '')
                        alias = data.get('alias', [])
                        all_movies.extend(data.get('movies', []))
                elif status_code == scraper.http_code_404:
                    actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_404)
                    logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                    need_insert = False
                    break
                elif status_code == scraper.http_code_login:
                    actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_login)
                    logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                    need_insert = False
                    break
                else:
                    logging.warning(f'fetch_page error. url: {url}')
                    # give up on this URL so an unexpected failure does not retry forever
                    break

            # A 401 or 404 was already recorded above, so skip this performer
            if not need_insert:
                continue

            # All of this performer's movies collected; insert the record
            performer_id = db_tools.insert_or_update_actor({
                'href': url,
                'name': person,
                'pic': pic,
                'alias': alias,
                'credits': all_movies
            })
            if performer_id:
                logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
                last_performer_id = performer_id
                succ_rows += 1
            else:
                logging.warning(f'insert person: ({person}) {url} failed.')
            time.sleep(0.5)

        logging.info(f'total request: {len(performers_list)}, succ: {succ_rows}, last performer id: {last_performer_id}')
        # Stop early in debug mode
        if debug:
            return True


# Update movie details
def fetch_movies_detail():
    limit_count = 10 if debug else 100
    movies_list = []
    last_movie_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]

    def get_movies(**kwargs):
        if scan_mode == 1:
            kwargs["uncensored"] = 1
        elif scan_mode == 0:
            kwargs["uncensored"] = 0
        else:
            logging.debug(f"scan all records.")
        kwargs["order_by"] = 'id asc'
        return db_tools.query_movie_hrefs(limit=limit_count, **kwargs)

    while True:
        if update_mode == 0:    # only new records
            movies_list = get_movies(start_id=0, is_full_data=0)
        elif update_mode == 1:  # only completed records
            movies_list = get_movies(start_id=last_movie_id, is_full_data=1)
        elif update_mode == 2:  # 0 + 1
            movies_list = get_movies(start_id=last_movie_id, is_full_data_not_in=abnormal_codes)
        elif update_mode == 3:  # abnormal records only
            movies_list = get_movies(start_id=last_movie_id, is_full_data_in=abnormal_codes)
        else:                   # everything
            movies_list = get_movies(start_id=last_movie_id)

        if len(movies_list) < 1:
            logging.info(f'all movies fetched.')
            break

        succ_count = 0
        for movie in movies_list:
            url = movie['href']
            title = movie['title']
            curr_id = movie['id']
            logging.debug(f"Fetching data for movie ({title}), url {url} ...")
            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
            # Page was served from the local cache; skip the database work if requested
            if skip_local and status_code == scraper.http_code_local:
                last_movie_id = curr_id
                succ_count += 1
                continue
            # Parse the page and write it to the database
            if soup:
                movie_data = scraper.parse_movie_detail(soup, url, title)
                if movie_data:
                    movie_id = db_tools.insert_or_update_movie(movie_data)
                    if movie_id:
                        logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
                        last_movie_id = movie_id
                        succ_count += 1
                    else:
                        logging.warning(f'insert movie {url} failed.')
                else:
                    logging.warning(f'parse_page_movie error. url: {url}')
            elif status_code == scraper.http_code_404:
                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
                logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
            elif status_code == scraper.http_code_login:
                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_login)
                logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skipping...')
            else:
                logging.warning(f'fetch_page error. url: {url}')
            time.sleep(0.5)
        logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
        # Stop early in debug mode
        if debug:
            return True


# Map command shortcuts to their functions
function_map = {
    "actor_list": fetch_actor_list,
    "maker_list": fetch_makers_list,
    "series_list": fetch_series_list,
    "makers": fetch_movies_by_maker,
    "series": fetch_movies_by_series,
    "pub": fetch_movies_by_publishers,
    "movies": fetch_movies_detail,
    "actors": fetch_performers_detail,
}


# Main entry point
def main(cmd, args):
    # Open a task log entry
    task_id = db_tools.insert_task_log()
    if task_id is None:
        logging.warning(f'insert task log error.')
        return None
    logging.info(f"running task. id: {task_id}, args: {args}")

    if cmd:
        # Run only the requested functions
        function_names = args.cmd.split(",")  # split the input on commas
        for short_name in function_names:
            func = function_map.get(short_name.strip())  # look up the function in the map
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {short_name}')
                func()
            else:
                logging.warning(f'{short_name} is not a valid function shortcut.')
    else:
        # Run everything
        for name, func in function_map.items():
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {name}')
                func()
            else:
                logging.warning(f'{name} is not a valid function shortcut.')

    logging.info(f'all processing completed!')
    db_tools.finalize_task_log(task_id)

# TODO:
# 1,


# Apply command-line arguments to the module-level switches
def set_env(args):
    global debug
    debug = args.debug
    if debug:
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)

    global skip_local
    skip_local = args.skip_local

    global scan_mode
    scan_mode = args.scan_mode

    global update_mode
    if args.update:
        update_mode = args.update


if __name__ == "__main__":
    # Command-line argument handling
    keys_str = ",".join(function_map.keys())
    parser = argparse.ArgumentParser(description='fetch javdb data.')
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0,
                        help='0 - only records with is_full_data=0 (default), 1 - only is_full_data=1, 2 - is_full_data<=1, 3 - only is_full_data>1 (abnormal records), 4 - all records')
    parser.add_argument('--scan_mode', type=int, choices=[0, 1, 2], default=2,
                        help='0 - only non-uncensored makers/series/actors/movies, 1 - only uncensored ones, 2 - everything (default)')
    parser.add_argument('--skip_local', action='store_true', help='Skip database writes for pages served from the local cache')
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    args = parser.parse_args()

    set_env(args)
    main(args.cmd, args)

'''
python3 ./fetch                             # scan all newly added records
python3 ./fetch --scan_mode=0               # scan newly added non-uncensored (censored) records
python3 ./fetch --scan_mode=1               # scan newly added uncensored records
python3 ./fetch --update=4                  # scan all records
python3 ./fetch --update=4 --scan_mode=0    # scan all non-uncensored (censored) records
python3 ./fetch --update=4 --scan_mode=1    # scan all uncensored records
'''
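
# Example of chaining --cmd shortcuts: main() splits the value on commas and runs each
# stage in order (any keys from function_map work; this particular combination is only
# illustrative):
#   python3 ./fetch --cmd=maker_list,makers,movies   # maker index -> maker movie index -> movie details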