diff --git a/javdb/src/fetch.py b/javdb/src/fetch.py
index 6b3b9c8..ea103c5 100644
--- a/javdb/src/fetch.py
+++ b/javdb/src/fetch.py
@@ -13,11 +13,9 @@ import utils
 
 config.setup_logging()
 debug = False
-force = False
 skip_local = False
-from_actor = False
-abnormal_only = False
 fast_mode = False
+update_mode = 0
 
 # 获取演员列表
 def fetch_actor_list():
@@ -90,16 +88,18 @@ def fetch_series_list():
 
 # 更新makers列表中的影片信息
 def fetch_movies_by_maker():
-    if fast_mode:
-        url_list = db_tools.query_maker_hrefs(from_list=1)
-    else:
-        url_list = db_tools.query_maker_hrefs()
-
     if debug:
         url_list = db_tools.query_maker_hrefs(name='muramura')
+    else:
+        if fast_mode:
+            url_list = db_tools.query_maker_hrefs(from_list=1)
+        else:
+            url_list = db_tools.query_maker_hrefs()
+
     for row in url_list:
         url = row['href']
         row_id = row['id']
+        uncensored = row['from_list'] if row['from_list'] > 0 else None
         # 去掉可下载的标志(如果有)
         next_url = utils.remove_url_query(url)
         while next_url:
@@ -109,7 +109,7 @@ def fetch_movies_by_maker():
             list_data, next_url = scraper.parse_maker_detail(soup, next_url)
             if list_data:
                 for movie in list_data:
-                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1, maker_id=row_id)
+                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1, maker_id=row_id, uncensored=uncensored)
                     if tmp_id:
                         logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}')
                     else:
@@ -127,16 +127,18 @@
 
 # 更新series列表中的影片信息
 def fetch_movies_by_series():
-    if fast_mode:
-        url_list = db_tools.query_series_hrefs(from_list=1)
-    else:
-        url_list = db_tools.query_series_hrefs()
-
     if debug:
         url_list = db_tools.query_series_hrefs(name='10musume')
+    else:
+        if fast_mode:
+            url_list = db_tools.query_series_hrefs(from_list=1)
+        else:
+            url_list = db_tools.query_series_hrefs()
+
     for row in url_list:
         url = row['href']
         row_id = row['id']
+        uncensored = row['from_list'] if row['from_list'] > 0 else None
         # 去掉可下载的标志(如果有)
         next_url = utils.remove_url_query(url)
         while next_url:
@@ -146,7 +148,7 @@ def fetch_movies_by_series():
             list_data, next_url = scraper.parse_series_detail(soup, next_url)
             if list_data:
                 for movie in list_data:
-                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1, series_id=row_id)
+                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1, series_id=row_id, uncensored=uncensored)
                     if tmp_id:
                         logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}')
                     else:
@@ -163,13 +165,14 @@ def fetch_movies_by_series():
 
 # 更新series列表中的影片信息
 def fetch_movies_by_publishers():
-    if fast_mode:
-        url_list = db_tools.query_publishers_hrefs(from_list=1)
-    else:
-        url_list = db_tools.query_publishers_hrefs()
-
     if debug:
         url_list = db_tools.query_publishers_hrefs(limit=1)
+    else:
+        if fast_mode:
+            url_list = db_tools.query_publishers_hrefs(from_list=1)
+        else:
+            url_list = db_tools.query_publishers_hrefs()
+
     for row in url_list:
         url = row['href']
         row_id = row['id']
@@ -201,30 +204,34 @@ def fetch_movies_by_publishers():
 # 更新演员信息
 def fetch_performers_detail():
     limit_count = 5 if debug else 100
-    perfomers_list = []
-    last_perfomer_id = 0
+    performers_list = []
+    last_performer_id = 0
     abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
+
+    def get_performers(**kwargs):
+        if fast_mode:
+            kwargs["from_actor_list"] = 1
+        kwargs["order_by"] = 'id asc'
+        return db_tools.query_actors(limit=limit_count, **kwargs)
+
     while True:
-        # 每次从数据库中取一部分,避免一次全量获取
-        if force: # 从头逐个遍历
-            if from_actor:
-                if abnormal_only:
-                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_in =abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
-                else:
-                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
-            else:
-                if abnormal_only:
-                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_in =abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
-                else:
-                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
-        else: # 只做更新
-            perfomers_list = db_tools.query_actors(is_full_data=0, limit=limit_count)
-        if len(perfomers_list) < 1:
+        if update_mode == 0:    # only new records
+            performers_list = get_performers(start_id=0, is_full_data=0)
+        elif update_mode == 1:  # only complete records
+            performers_list = get_performers(start_id=last_performer_id, is_full_data=1)
+        elif update_mode == 2:  # 0 + 1
+            performers_list = get_performers(start_id=last_performer_id, is_full_data_not_in=abnormal_codes)
+        elif update_mode == 3:  # abnormal records only
+            performers_list = get_performers(start_id=last_performer_id, is_full_data_in=abnormal_codes)
+        else:                   # all records
+            performers_list = get_performers(start_id=last_performer_id)
+
+        if len(performers_list) < 1:
             logging.info(f'all performers fetched.')
             break
         succ_rows = 0
-        for performer in perfomers_list:
+        for performer in performers_list:
            url = performer['href']
            person = performer['name']
            pic = ''
@@ -249,7 +256,7 @@ def fetch_performers_detail():
                    need_insert = False
                    break
                elif status_code and status_code == scraper.http_code_login:
-                   actor_id = db_tools.insert_or_update_movie_404(name=person, href=url, is_full_data=scraper.http_code_login)
+                   actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_login)
                    logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skiping...')
                    need_insert = False
                    break
@@ -270,13 +277,13 @@ def fetch_performers_detail():
            })
            if performer_id:
                logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
-               last_perfomer_id = performer_id
+               last_performer_id = performer_id
                succ_rows += 1
            else:
                logging.warning(f'insert person: ({person}) {url} failed.')
            time.sleep(0.5)
 
-        logging.info(f'total request: {len(perfomers_list)}, succ: {succ_rows}, last performer id: {last_perfomer_id}')
+        logging.info(f'total request: {len(performers_list)}, succ: {succ_rows}, last performer id: {last_performer_id}')
        # 调试break
        if debug:
            return True
@@ -287,23 +294,29 @@ def fetch_movies_detail():
     movies_list = []
     last_movie_id = 0
     abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
-    while True:
-        if force:
-            if from_actor:
-                if abnormal_only:
-                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_in =abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
-                else:
-                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
-            else:
-                if abnormal_only:
-                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_in =abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
-                else:
-                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
-        else: # 只做更新
-            movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
+
+    def get_movies(**kwargs):
+        if fast_mode:
+            kwargs["uncensored"] = 1
+        kwargs["order_by"] = 'id asc'
+        return db_tools.query_movie_hrefs(limit=limit_count, **kwargs)
+
+    while True:
+        if update_mode == 0:    # only new records
+            movies_list = get_movies(start_id=0, is_full_data=0)
+        elif update_mode == 1:  # only complete records
+            movies_list = get_movies(start_id=last_movie_id, is_full_data=1)
+        elif update_mode == 2:  # 0 + 1
+            movies_list = get_movies(start_id=last_movie_id, is_full_data_not_in=abnormal_codes)
+        elif update_mode == 3:  # abnormal records only
+            movies_list = get_movies(start_id=last_movie_id, is_full_data_in=abnormal_codes)
+        else:                   # all records
+            movies_list = get_movies(start_id=last_movie_id)
+
         if len(movies_list) < 1:
             logging.info(f'all movies fetched.')
             break
+
         succ_count = 0
         for movie in movies_list:
             url = movie['href']
@@ -399,33 +412,26 @@ def set_env(args):
         logger = logging.getLogger()
         logger.setLevel(logging.DEBUG)
 
-    global force
-    force = args.force
-
     global skip_local
     skip_local = args.skip_local
 
-    global from_actor
-    from_actor = args.from_actor
-
-    global abnormal_only
-    abnormal_only = args.abnormal_only
-
     global fast_mode
     fast_mode = args.fast_mode
 
+    global update_mode
+    if args.update:
+        update_mode = args.update
+
 
 if __name__ == "__main__":
     # 命令行参数处理
     keys_str = ",".join(function_map.keys())
     parser = argparse.ArgumentParser(description='fetch javdb data.')
     parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
+    parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0: only is_full_data=0, 1: only is_full_data=1, 2: is_full_data<=1, 3: only is_full_data>1 (abnormal records), 4: all records')
+    parser.add_argument('--fast_mode', action='store_true', help='only iterate over uncensored makers/series/actors/movies')
+    parser.add_argument('--skip_local', action='store_true', help='skip database operations if the page is already cached locally')
     parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
-    parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
-    parser.add_argument('--skip_local', action='store_true', help='skip if cached html (true for skip)')
-    parser.add_argument('--from_actor', action='store_true', help='只遍历来自 actor_list 的 演员或者影片 (在force模式下有效)')
-    parser.add_argument('--abnormal_only', action='store_true', help='只遍历异常URL(404或者需要登陆查看等) 的 演员或影片 (在force模式下有效)')
-    parser.add_argument('--fast_mode', action='store_true', help='只遍历所有 uncensored 的 makers 和 series ')
 
     args = parser.parse_args()
     set_env(args)
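For reference, the new `--update` flag consolidates the old `--force`/`--from_actor`/`--abnormal_only` switches into a single mode selector, while `--fast_mode` narrows every query to uncensored sources. The sketch below condenses the `update_mode` dispatch used in `fetch_performers_detail()`/`fetch_movies_detail()` into one standalone helper; the helper name `build_update_filters` and the literal status codes are illustrative only, not part of the patch.

```python
# Illustrative sketch only: mirrors how update_mode maps to query filters.
from typing import Any, Dict, List

def build_update_filters(update_mode: int, last_id: int, abnormal_codes: List[int]) -> Dict[str, Any]:
    if update_mode == 0:        # new records only (is_full_data = 0)
        return {"start_id": 0, "is_full_data": 0}
    if update_mode == 1:        # complete records only (is_full_data = 1)
        return {"start_id": last_id, "is_full_data": 1}
    if update_mode == 2:        # 0 + 1: everything except abnormal records
        return {"start_id": last_id, "is_full_data_not_in": abnormal_codes}
    if update_mode == 3:        # abnormal records only (404 / login required)
        return {"start_id": last_id, "is_full_data_in": abnormal_codes}
    return {"start_id": last_id}  # 4: all records

# Mode 2 resumes after the last processed id and skips 404/401 rows.
print(build_update_filters(2, last_id=1200, abnormal_codes=[404, 401]))
```

Under the same assumptions, a typical incremental run would be invoked from `javdb/src` as roughly `python fetch.py --cmd <shortcut> --update 2 --fast_mode`, where the `--cmd` shortcuts come from `function_map`, which is not shown in this diff.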
+ """ + values = [title] + [kwargs[field] for field in fields] + [href] + logging.debug(f"sql: {sql}, values: {values}") + cursor.execute(sql, values) + else: # 如果电影不存在,插入 + columns = ', '.join(['title', 'href'] + fields) + placeholders = ', '.join(['?'] * (len(fields) + 2)) + sql = f"INSERT INTO javdb_movies ({columns}) VALUES ({placeholders})" + values = [title, href] + [kwargs.get(field, 0) for field in fields] + logging.debug(f"sql: {sql}, values: {values}") + cursor.execute(sql, values) + + conn.commit() + + movie_id = get_id_by_href('javdb_movies', href) + if movie_id: + logging.debug(f'Inserted/Updated movie index, id: {movie_id}, title: {title}, href: {href}') + + return movie_id + + except Exception as e: + conn.rollback() + logging.error(f"Error inserting/updating movie: {e}") + return None + # 插入演员和电影的关联数据 def insert_actor_movie(performer_id, movie_id, tags=''): @@ -145,10 +198,11 @@ def insert_or_update_actor(actor): ON CONFLICT(href) DO UPDATE SET name=excluded.name, pic=excluded.pic, is_full_data=1, updated_at=datetime('now', 'localtime') ''', (actor['name'], actor['href'], actor['pic'])) - cursor.execute('SELECT id FROM javdb_actors WHERE href = ?', (actor['href'],)) conn.commit() - actor_id = get_id_by_href('javdb_actors', actor['href']) + # 查询刚插入的数据 + cursor.execute('SELECT id, from_actor_list FROM javdb_actors WHERE href = ?', (actor['href'],)) + actor_id, uncensored = cursor.fetchone() if actor_id is None: logging.warning(f'insert data error. name: {actor['name']}, href: {actor['href']}') return None @@ -166,10 +220,10 @@ def insert_or_update_actor(actor): # 插入影片列表 for movie in actor.get("credits") or []: - movie_id = get_id_by_href('javdb_movies', movie['href']) - # 影片不存在,先插入 - if movie_id is None: - # TODO: from_actor_list 只标记无码女优的话,这里要修改,暂时不动 + # from_actor_list = 1 表示无码影星的,其他不处理 + if uncensored and uncensored > 0: + movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1, uncensored=uncensored) + else: movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1) if movie_id: tmp_id = insert_actor_movie(actor_id, movie_id) @@ -227,47 +281,40 @@ def query_actors(**filters): sql = "SELECT href, name FROM javdb_actors WHERE 1=1" params = [] - if "id" in filters: - sql += " AND id = ?" - params.append(filters["id"]) - if "href" in filters: - sql += " AND href = ?" - params.append(filters["href"]) - if "name" in filters: - sql += " AND name LIKE ?" - params.append(f"%{filters['name']}%") - if "is_full_data" in filters: - sql += " AND is_full_data = ?" - params.append(filters["is_full_data"]) - if "from_actor_list" in filters: - sql += " AND from_actor_list = ?" - params.append(filters["from_actor_list"]) - if "is_full_data_in" in filters: - values = filters["is_full_data_in"] - if values: - placeholders = ", ".join(["?"] * len(values)) - sql += f" AND is_full_data IN ({placeholders})" - params.extend(values) - if "is_full_data_not_in" in filters: - values = filters["is_full_data_not_in"] - if values: - placeholders = ", ".join(["?"] * len(values)) - sql += f" AND is_full_data NOT IN ({placeholders})" - params.extend(values) - if "before_updated_at" in filters: - sql += " AND updated_at <= ?" - params.append(filters["before_updated_at"]) - if "after_updated_at" in filters: - sql += " AND updated_at >= ?" - params.append(filters["after_updated_at"]) - if "start_id" in filters: - sql += " AND id > ?" 
@@ -145,10 +198,11 @@ def insert_or_update_actor(actor):
             ON CONFLICT(href) DO UPDATE SET
                 name=excluded.name, pic=excluded.pic, is_full_data=1, updated_at=datetime('now', 'localtime')
         ''', (actor['name'], actor['href'], actor['pic']))
-        cursor.execute('SELECT id FROM javdb_actors WHERE href = ?', (actor['href'],))
         conn.commit()
 
-        actor_id = get_id_by_href('javdb_actors', actor['href'])
+        # read back the row that was just upserted
+        cursor.execute('SELECT id, from_actor_list FROM javdb_actors WHERE href = ?', (actor['href'],))
+        actor_id, uncensored = cursor.fetchone()
         if actor_id is None:
             logging.warning(f'insert data error. name: {actor['name']}, href: {actor['href']}')
             return None
@@ -166,10 +220,10 @@ def insert_or_update_actor(actor):
 
         # 插入影片列表
         for movie in actor.get("credits") or []:
-            movie_id = get_id_by_href('javdb_movies', movie['href'])
-            # 影片不存在,先插入
-            if movie_id is None:
-                # TODO: from_actor_list 只标记无码女优的话,这里要修改,暂时不动
+            # from_actor_list = 1 marks an uncensored performer; anything else is not flagged
+            if uncensored and uncensored > 0:
+                movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1, uncensored=uncensored)
+            else:
                 movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1)
             if movie_id:
                 tmp_id = insert_actor_movie(actor_id, movie_id)
@@ -227,47 +281,40 @@ def query_actors(**filters):
         sql = "SELECT href, name FROM javdb_actors WHERE 1=1"
         params = []
 
-        if "id" in filters:
-            sql += " AND id = ?"
-            params.append(filters["id"])
-        if "href" in filters:
-            sql += " AND href = ?"
-            params.append(filters["href"])
-        if "name" in filters:
-            sql += " AND name LIKE ?"
-            params.append(f"%{filters['name']}%")
-        if "is_full_data" in filters:
-            sql += " AND is_full_data = ?"
-            params.append(filters["is_full_data"])
-        if "from_actor_list" in filters:
-            sql += " AND from_actor_list = ?"
-            params.append(filters["from_actor_list"])
-        if "is_full_data_in" in filters:
-            values = filters["is_full_data_in"]
-            if values:
-                placeholders = ", ".join(["?"] * len(values))
-                sql += f" AND is_full_data IN ({placeholders})"
-                params.extend(values)
-        if "is_full_data_not_in" in filters:
-            values = filters["is_full_data_not_in"]
-            if values:
-                placeholders = ", ".join(["?"] * len(values))
-                sql += f" AND is_full_data NOT IN ({placeholders})"
-                params.extend(values)
-        if "before_updated_at" in filters:
-            sql += " AND updated_at <= ?"
-            params.append(filters["before_updated_at"])
-        if "after_updated_at" in filters:
-            sql += " AND updated_at >= ?"
-            params.append(filters["after_updated_at"])
-        if "start_id" in filters:
-            sql += " AND id > ?"
-            params.append(filters["start_id"])
+        conditions = {
+            "id": " AND id = ?",
+            "href": " AND href = ?",
+            "name": " AND name LIKE ?",
+            "is_full_data": " AND is_full_data = ?",
+            "from_actor_list": " AND from_actor_list = ?",
+            "before_updated_at": " AND updated_at <= ?",
+            "after_updated_at": " AND updated_at >= ?",
+            "start_id": " AND id > ?",
+        }
+
+        for key, condition in conditions.items():
+            if key in filters:
+                sql += condition
+                if key == "name":
+                    params.append(f"%{filters[key]}%")
+                else:
+                    params.append(filters[key])
+
+        for key in ["is_full_data_in", "is_full_data_not_in"]:
+            if key in filters:
+                values = filters[key]
+                if values:
+                    placeholders = ", ".join(["?"] * len(values))
+                    operator = "IN" if key == "is_full_data_in" else "NOT IN"
+                    sql += f" AND is_full_data {operator} ({placeholders})"
+                    params.extend(values)
+
         if "order_by" in filters:
-            sql += " order by ? asc"
-            params.append(filters["order_by"])
+            # the ORDER BY column has to be interpolated directly; a bound placeholder would be treated as a string literal
+            sql += f" ORDER BY {filters['order_by']} "
+
         if 'limit' in filters:
-            sql += " limit ?"
+            sql += " LIMIT ?"
             params.append(filters["limit"])
 
         cursor.execute(sql, params)
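Because `order_by` is now formatted straight into the SQL string (the old `" order by ? asc"` variant bound the column name as a parameter, which SQLite treats as a constant rather than a column), the value should only ever come from trusted call sites such as the fetchers' hard-coded `'id asc'`. A minimal whitelist guard, offered as a hardening suggestion under assumed column names rather than something this patch implements:

```python
# Hardening sketch: restrict the interpolated ORDER BY clause to known-safe values.
ALLOWED_ORDER_BY = {"id asc", "id desc", "updated_at asc", "updated_at desc"}

def safe_order_by(clause: str) -> str:
    if clause.strip().lower() not in ALLOWED_ORDER_BY:
        raise ValueError(f"unsupported order_by clause: {clause!r}")
    return f" ORDER BY {clause} "

print(safe_order_by("id asc"))   # " ORDER BY id asc "
```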
@@ -353,7 +400,7 @@
 # 按条件查询 href 列表
 def query_maker_hrefs(**filters):
     try:
-        sql = "SELECT href, id FROM javdb_makers WHERE 1=1"
+        sql = "SELECT href, id, from_list FROM javdb_makers WHERE 1=1"
         params = []
 
         if "id" in filters:
@@ -368,10 +415,13 @@
         if "name" in filters:
             sql += " AND name LIKE ?"
             params.append(f"%{filters['name']}%")
+        if 'limit' in filters:
+            sql += " limit ?"
+            params.append(filters["limit"])
 
         cursor.execute(sql, params)
         #return [row[0] for row in cursor.fetchall()] # 链接使用小写
-        return [{'href': row[0], 'id': row[1]} for row in cursor.fetchall()]
+        return [{'href': row[0], 'id': row[1], 'from_list': row[2]} for row in cursor.fetchall()]
 
     except sqlite3.Error as e:
         logging.error(f"查询 href 失败: {e}")
@@ -451,7 +501,7 @@ def query_series(identifier):
 # 按条件查询 href 列表
 def query_series_hrefs(**filters):
     try:
-        sql = "SELECT href, id FROM javdb_series WHERE 1=1"
+        sql = "SELECT href, id, from_list FROM javdb_series WHERE 1=1"
         params = []
 
         if "id" in filters:
@@ -466,10 +516,14 @@
         if "name" in filters:
             sql += " AND name LIKE ?"
             params.append(f"%{filters['name']}%")
+        if 'limit' in filters:
+            sql += " limit ?"
+            params.append(filters["limit"])
 
         cursor.execute(sql, params)
         #return [row[0] for row in cursor.fetchall()] # 链接使用小写
-        return [{'href': row[0], 'id': row[1]} for row in cursor.fetchall()]
+        #return [{'href': row[0], 'id': row[1]} for row in cursor.fetchall()]
+        return [{'href': row[0], 'id': row[1], 'from_list': row[2]} for row in cursor.fetchall()]
 
     except sqlite3.Error as e:
         logging.error(f"查询 href 失败: {e}")
@@ -795,7 +849,7 @@ def query_movies(identifier):
     return None
 
 # 按条件查询 href 列表
-def query_movie_hrefs(**filters):
+def query_movie_hrefs_old(**filters):
     try:
         sql = "SELECT href, title, id FROM javdb_movies WHERE 1=1"
         params = []
@@ -850,7 +904,58 @@
     except sqlite3.Error as e:
         logging.error(f"查询 href 失败: {e}")
         return []
-
+
+# query movie hrefs by filters
+def query_movie_hrefs(**filters):
+    try:
+        sql = "SELECT href, title FROM javdb_movies WHERE 1=1"
+        params = []
+
+        conditions = {
+            "id": " AND id = ?",
+            "href": " AND href = ?",
+            "title": " AND title LIKE ?",
+            "is_full_data": " AND is_full_data = ?",
+            "uncensored": " AND uncensored = ?",
+            "from_actor_list": " AND from_actor_list = ?",
+            "before_updated_at": " AND updated_at <= ?",
+            "after_updated_at": " AND updated_at >= ?",
+            "start_id": " AND id > ?",
+        }
+
+        for key, condition in conditions.items():
+            if key in filters:
+                sql += condition
+                if key == "title":
+                    params.append(f"%{filters[key]}%")
+                else:
+                    params.append(filters[key])
+
+        for key in ["is_full_data_in", "is_full_data_not_in"]:
+            if key in filters:
+                values = filters[key]
+                if values:
+                    placeholders = ", ".join(["?"] * len(values))
+                    operator = "IN" if key == "is_full_data_in" else "NOT IN"
+                    sql += f" AND is_full_data {operator} ({placeholders})"
+                    params.extend(values)
+
+        if "order_by" in filters:
+            # the ORDER BY column has to be interpolated directly; a bound placeholder would be treated as a string literal
+            sql += f" ORDER BY {filters['order_by']} "
+
+        if 'limit' in filters:
+            sql += " LIMIT ?"
+            params.append(filters["limit"])
+
+        cursor.execute(sql, params)
+        #return [row[0].lower() for row in cursor.fetchall()] # lowercase hrefs
+        return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
+
+    except sqlite3.Error as e:
+        logging.error(f"查询 href 失败: {e}")
+        return []
+
 
 # 插入一条任务日志
 def insert_task_log():
     try:
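To see what the conditions-dict pattern actually produces, the standalone builder below mirrors the logic of the new `query_movie_hrefs()` (it is a copy for demonstration, not the library function itself) and prints the SQL and parameters for a typical fast-mode, incremental call.

```python
# Mirrors the conditions-dict builder to show the generated SQL; no database access.
def build_movie_query(**filters):
    sql = "SELECT href, title FROM javdb_movies WHERE 1=1"
    params = []
    conditions = {
        "is_full_data": " AND is_full_data = ?",
        "uncensored": " AND uncensored = ?",
        "start_id": " AND id > ?",
    }
    for key, condition in conditions.items():
        if key in filters:
            sql += condition
            params.append(filters[key])
    for key in ["is_full_data_in", "is_full_data_not_in"]:
        if key in filters and filters[key]:
            placeholders = ", ".join(["?"] * len(filters[key]))
            operator = "IN" if key == "is_full_data_in" else "NOT IN"
            sql += f" AND is_full_data {operator} ({placeholders})"
            params.extend(filters[key])
    if "order_by" in filters:
        sql += f" ORDER BY {filters['order_by']}"
    if "limit" in filters:
        sql += " LIMIT ?"
        params.append(filters["limit"])
    return sql, params

sql, params = build_movie_query(start_id=0, uncensored=1,
                                is_full_data_not_in=[404, 401],
                                order_by="id asc", limit=100)
print(sql)     # ... AND uncensored = ? AND id > ? AND is_full_data NOT IN (?, ?) ORDER BY id asc LIMIT ?
print(params)  # [1, 0, 404, 401, 100]
```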