diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py
index c8a4249..e6940e5 100644
--- a/iafd/src/fetch.py
+++ b/iafd/src/fetch.py
@@ -244,6 +244,7 @@ def fetch_performers_detail_once(perfomers_list):
         url = performer['href']
         person = performer['name']
         curr_id = performer['id']
+        movies_cnt = performer['movies_cnt']
 
         logging.debug(f"Fetching data for performer ({person}), url {url} ...")
         soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))  # 从本地读取的文件,忽略
@@ -253,6 +254,14 @@ def fetch_performers_detail_once(perfomers_list):
         if soup:
             data = scraper.parse_page_performer(soup, url)
             if data:
+                # Check whether the movie count on the page has grown; skip this performer if not
+                page_movies_cnt = int(data.get('movies_cnt', '0'))
+                if page_movies_cnt <= movies_cnt:
+                    if not force:
+                        logging.info(f"actor already up to date, skipping. person: ({person}), url: {url}")
+                        last_performer_id = curr_id
+                        continue
+
                 performer_id = db_tools.insert_or_update_performer({
                     'href': url,
                     'person': person,
@@ -377,6 +386,8 @@ def fetch_movies_detail():
     if debug:
         return True
 
+def reset_actor_movie_cnt():
+    db_tools.reset_actor_movies()
 
 # 建立缩写到函数的映射
 function_map = {
@@ -387,6 +398,7 @@ function_map = {
     "stu"       : fetch_movies_by_stu,
     "performers": fetch_performers_detail,
     "movies"    : fetch_movies_detail,
+    "reset_mv"  : reset_actor_movie_cnt,
 }
 
 # 主函数
diff --git a/iafd/src/sqlite_utils.py b/iafd/src/sqlite_utils.py
index 180f55f..ca758cb 100644
--- a/iafd/src/sqlite_utils.py
+++ b/iafd/src/sqlite_utils.py
@@ -329,7 +329,7 @@ def query_performer(identifier):
 # 按条件查询 href 列表
 def query_performer_hrefs(**filters):
     try:
-        sql = "SELECT href, name, id FROM iafd_performers WHERE 1=1"
+        sql = "SELECT href, name, id, movies_cnt FROM iafd_performers WHERE 1=1"
         params = []
 
         if "id" in filters:
@@ -375,7 +375,7 @@ def query_performer_hrefs(**filters):
         logging.debug(f"query sql: {sql}")
         cursor.execute(sql, params)
         #return [row[0].lower() for row in cursor.fetchall()]  # 返回小写
-        return [{'href': row[0], 'name': row[1], 'id':row[2]} for row in cursor.fetchall()]
+        return [{'href': row[0], 'name': row[1], 'id':row[2], 'movies_cnt':row[3]} for row in cursor.fetchall()]
 
     except sqlite3.Error as e:
         logging.error(f"查询 href 失败: {e}")
@@ -905,6 +905,62 @@ def check_and_create_stat_table(taskid = 0):
         logging.warning(f"An error occurred: {e}")
 
 
+# Recalculate the movies_cnt column for every performer
+def reset_actor_movies(check_and_do = 0):
+    try:
+        # Check whether the movies_cnt column already exists
+        cursor.execute(f"PRAGMA table_info(iafd_performers);")
+        columns = [row[1] for row in cursor.fetchall()]
+
+        if 'movies_cnt' not in columns:
+            # Column is missing, add it
+            add_field_sql = f"""
+                ALTER TABLE iafd_performers ADD COLUMN movies_cnt INTEGER DEFAULT 0 NOT NULL;
+            """
+            cursor.execute(add_field_sql)
+            logging.info("movies_cnt column added")
+        else:
+            logging.info("movies_cnt column already exists, skipping")
+
+        # Make sure the link table is indexed
+        cursor.execute(f"""
+            CREATE INDEX IF NOT EXISTS idx_iafd_performers_movies_performer_id
+            ON iafd_performers_movies(performer_id);
+        """)
+
+        # Build a temporary table with the per-performer counts
+        cursor.execute(f"""
+            CREATE TEMPORARY TABLE temp_actor_counts AS
+            SELECT performer_id, COUNT(movie_id) AS cnt
+            FROM iafd_performers_movies
+            GROUP BY performer_id;
+        """)
+
+        # Index the temporary table
+        cursor.execute("CREATE INDEX idx_temp_performer_id ON temp_actor_counts(performer_id);")
+
+        # Update the main table
+        cursor.execute(f"""
+            UPDATE iafd_performers
+            SET movies_cnt = COALESCE((
+                SELECT cnt FROM temp_actor_counts
+                WHERE performer_id = iafd_performers.id
+            ), 0);  -- COALESCE covers performers with no movies
+        """)
+
+        updated_rows = cursor.rowcount
+        logging.info(f"Updated movies_cnt for {updated_rows} performers")
+
+        # Clean up the temporary table
+        cursor.execute("DROP TABLE IF EXISTS temp_actor_counts;")
+        conn.commit()
+
+        logging.info("Done!")
+
+    except sqlite3.Error as e:
+        conn.rollback()
+        logging.error("Error updating actor movies_cnt: %s", e)
+
 
 # 插入一条任务日志
 def insert_task_log():
diff --git a/src/crawling/craw.py b/src/crawling/craw.py
index 93efa32..9265b1a 100644
--- a/src/crawling/craw.py
+++ b/src/crawling/craw.py
@@ -178,6 +178,7 @@ class JavbusCrawler(GenericCrawler):
         """
         result = {
             'avatar': {},
+            'title' : {},
             'movies': []
         }
 
@@ -189,6 +190,9 @@ class JavbusCrawler(GenericCrawler):
         else:
             logging.debug(f"avatar-box not found. href: {href}")
 
+        # Parse the page title block for the movie count and related info
+        result['title'] = self.parse_title_info(soup, href)
+
         # 解析影片列表
         movie_boxes = soup.find_all('a', class_='movie-box')
         if movie_boxes:
@@ -336,17 +340,9 @@
         return movie_info
 
-
-    # 获取演员详情
-    def parse_studios_labels_series_detail(self, soup, href):
-        """
-        解析Javbus网页内容,提取演员信息和影片列表
-        """
-        result = {
-            'meta': {},
-            'movies': []
-        }
-
+    # Parse the header block at the top of the page
+    def parse_title_info(self, soup, href):
+        title_info = {}
         try:
             # 解析标题
             b_tag = soup.select_one('.alert.alert-success.alert-common p b')
 
@@ -367,8 +363,8 @@
                     # 提取前两个元素作为工作室和角色
                     studio = parts[video_index - 2]
                     role = parts[video_index - 1]
-                    result['meta']['title'] = studio
-                    result['meta']['role'] = role
+                    title_info['title'] = studio
+                    title_info['role'] = role
                 else:
                     logging.debug(f"无法按规则解析: {' - '.join(parts)}")
 
@@ -384,13 +380,31 @@
             if '全部影片' in text:
                 match = re.search(r'全部影片\s*(\d+)\s*', text)
                 if match:
-                    result['meta']['movies_cnt'] = int(match.group(1))
+                    title_info['movies_cnt'] = int(match.group(1))
 
             # 提取已有磁力数量
             if '已有磁力' in text:
                 match = re.search(r'已有磁力\s*(\d+)\s*', text)
                 if match:
-                    result['meta']['magnet_cnt'] = int(match.group(1))
+                    title_info['magnet_cnt'] = int(match.group(1))
+        except Exception as e:
+            logging.warning(f"parse html error: {str(e)}, href: {href}", exc_info=True)
+
+        return title_info
+
+    # Parse studio/label/series detail
+    def parse_studios_labels_series_detail(self, soup, href):
+        """
+        Parse the Javbus page and extract the meta info and the movie list
+        """
+        result = {
+            'meta': {},
+            'movies': []
+        }
+
+        try:
+            # Parse the title block
+            result['meta'] = self.parse_title_info(soup, href)
 
             div_waterfall = soup.find('div', id='waterfall')
             if not div_waterfall:
diff --git a/src/db_utils/sqlite_db.py b/src/db_utils/sqlite_db.py
index 9ce1da8..889ada2 100644
--- a/src/db_utils/sqlite_db.py
+++ b/src/db_utils/sqlite_db.py
@@ -281,7 +281,7 @@
 
     def query_actors(self, **filters):
         try:
-            sql = f"SELECT href, en_name as name, uncensored FROM {self.tbl_name_actors} WHERE 1=1"
+            sql = f"SELECT href, en_name as name, uncensored, movies_cnt FROM {self.tbl_name_actors} WHERE 1=1"
             params = []
 
             conditions = {
@@ -319,7 +319,7 @@
                 params.append(filters["limit"])
 
             self.cursor.execute(sql, params)
-            return [{'href': row[0], 'name': row[1], 'uncensored': row[2]} for row in self.cursor.fetchall()]
+            return [{'href': row[0], 'name': row[1], 'uncensored': row[2], 'movies_cnt':row[3]} for row in self.cursor.fetchall()]
         except sqlite3.Error as e:
             logging.error(f"查询 href 失败: {e}")
             return None
@@ -694,3 +694,55 @@
             self.conn.rollback()
             logging.error("Error inserting movie: %s", e)
             logging.error(f"query error: {e}")
+
+    # Recalculate the movies_cnt column for every actor
+    def reset_actor_movies(self, check_and_do = 0):
+        try:
+            # Check whether the movies_cnt column already exists
+            self.cursor.execute(f"PRAGMA table_info({self.tbl_name_actors});")
+            columns = [row[1] for row in self.cursor.fetchall()]
+
+            if 'movies_cnt' not in columns:
+                # Column is missing, add it
+                add_field_sql = f"""
+                    ALTER TABLE {self.tbl_name_actors} ADD COLUMN movies_cnt INTEGER DEFAULT 0 NOT NULL;
+                """
+                self.cursor.execute(add_field_sql)
+                logging.info("movies_cnt column added")
+            else:
+                logging.info("movies_cnt column already exists, skipping")
+
+            # Make sure the link table is indexed
+            self.cursor.execute(f"""
+                CREATE INDEX IF NOT EXISTS idx_actor_movie_actor_id
+                ON {self.tbl_name_actor_movie}(actor_id);
+            """)
+
+            # Build a temporary table with the per-actor counts
+            self.cursor.execute(f"""
+                CREATE TEMPORARY TABLE temp_actor_counts AS
+                SELECT actor_id, COUNT(movie_id) AS cnt
+                FROM {self.tbl_name_actor_movie}
+                GROUP BY actor_id;
+            """)
+
+            # Index the temporary table
+            self.cursor.execute("CREATE INDEX idx_temp_actor_id ON temp_actor_counts(actor_id);")
+
+            # Update the main table
+            self.cursor.execute(f"""
+                UPDATE {self.tbl_name_actors}
+                SET movies_cnt = COALESCE((
+                    SELECT cnt FROM temp_actor_counts
+                    WHERE actor_id = {self.tbl_name_actors}.id
+                ), 0);  -- COALESCE covers actors with no movies
+            """)
+            updated_rows = self.cursor.rowcount
+            logging.info(f"Updated movies_cnt for {updated_rows} actors")
+
+            self.conn.commit()
+            logging.info("Done!")
+
+        except sqlite3.Error as e:
+            self.conn.rollback()
+            logging.error("Error updating actor movies_cnt: %s", e)
diff --git a/src/javbus/fetch.py b/src/javbus/fetch.py
index 26bab85..042b71d 100644
--- a/src/javbus/fetch.py
+++ b/src/javbus/fetch.py
@@ -18,6 +18,7 @@
 scraper = craw.JavbusCrawler()
 
 debug = False
 skip_local = False
+g_force = False
 g_uncensored = 0
 update_mode = 0
@@ -313,6 +314,7 @@ def fetch_performers_detail():
         url = performer['href']
         person = performer['name']
         uncensored = int(performer['uncensored'])
+        movies_cnt = int(performer['movies_cnt'])
         avatar = None
         if not utils.is_valid_url(url):
             actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_404})
@@ -328,8 +330,18 @@
             if soup:
                 data, next_url = scraper.parse_actor_detail(soup, next_url)
                 if data:
+                    # On the first page, read the total movie count; skip the actor if it matches the database
                     if not avatar:
                         avatar = data.get('avatar')
+                        page_title = data.get('title', {})
+                        page_movies_cnt = int(page_title.get('movies_cnt', '0'))
+                        if page_movies_cnt <= movies_cnt:
+                            # Everything has already been fetched; nothing to update.
+                            need_insert = g_force
+                            logging.info(f"actor already up to date. insert flag: {need_insert}. person: ({person}), url: {url}")
+                            if not need_insert:
+                                break
+
                     all_movies.extend(data.get('movies', []))
             else:
                 logging.warning(f"fetch_page error. url: {url}")
@@ -455,6 +467,7 @@ def fetch_movies_detail():
 # 重置 movies 表的 uncensored 标志位
 def reset_movies_uncensored():
     db_tools.reset_movies_uncensored(check_and_do=0 if debug else 1)
+    db_tools.reset_actor_movies()
 
 # 建立缩写到函数的映射
 function_map = {
@@ -523,6 +536,9 @@
     if args.update:
         update_mode = args.update
 
+    global g_force
+    g_force = args.force
+
 if __name__ == "__main__":
     # 命令行参数处理
     keys_str = ",".join(function_map.keys())
@@ -548,6 +564,7 @@
     parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0-只遍历is_full_data=0(默认), 1-只遍历is_full_data=1, 2-遍历is_full_data<=1, 3-只遍历is_full_data>1(异常数据), 4-遍历所有')
     parser.add_argument('--uncensored', type=int, choices=[0, 1, 2], default=1, help='1-只遍历所有 uncensored 的 makers/series/actors/movies(默认), 0-与前者相反, 2-全量')
     parser.add_argument('--skip_local', action='store_true', help='如果本地缓存了页面,则跳过数据库操作')
+    parser.add_argument('--force', action='store_true', help='Force an update even when the stored movie count is unchanged')
     parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
 
     args = parser.parse_args()
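
Note on the skip logic introduced above (not part of the diff): both fetch loops now compare the movie count parsed from the page header against the movies_cnt value stored in the database, and only re-crawl a performer when the page reports more titles, or when a force run is requested (--force on the javbus side). The helper below is a hypothetical, minimal sketch of that comparison, shown only to make the rule explicit:

    # Hypothetical helper illustrating the skip rule used in both fetchers.
    def should_refetch(stored_cnt: int, page_cnt: int, force: bool) -> bool:
        # Re-crawl when the page lists more movies than the database knows about,
        # or when the caller explicitly forces an update.
        return force or page_cnt > stored_cnt

    assert should_refetch(10, 12, force=False)      # new movies appeared -> refetch
    assert not should_refetch(10, 10, force=False)  # nothing new -> skip
    assert should_refetch(10, 10, force=True)       # force overrides the skip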