diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py index 42f8bce..9d3db84 100644 --- a/iafd/src/fetch.py +++ b/iafd/src/fetch.py @@ -245,7 +245,7 @@ def fetch_performers_detail_once(perfomers_list): logging.debug(f"Fetching data for performer ({person}), url {url} ...") soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id")) if soup: - data = scraper.parse_page_performer(soup) + data = scraper.parse_page_performer(soup, url) if data: performer_id = db_tools.insert_or_update_performer({ 'href': url, @@ -281,10 +281,14 @@ def fetch_performers_detail_once(perfomers_list): def fetch_performers_detail(): limit_count = 5 if debug else 100 perfomers_list = [] + last_perfomer_id = 0 # 获取新演员的列表 - while True: - perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count) + while True: + if force: # 从头逐个遍历 + perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, order_by='id asc', limit=limit_count) + else: # 只做更新 + perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count) if len(perfomers_list) < 1: logging.info(f'all new performers fetched. ') break @@ -308,12 +312,15 @@ def fetch_performers_detail(): def fetch_movies_detail(): limit_count = 10 if debug else 100 movies_list = [] + last_movie_id = 0 while True: - movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count) + if force: # 从头逐个遍历 + movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, order_by='id asc', limit=limit_count) + else: # 只做更新 + movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count) if len(movies_list) < 1: logging.info(f'all movies fetched.') break - last_movie_id = 0 succ_count = 0 for movie in movies_list: url = movie['href'] diff --git a/iafd/src/iafd_scraper.py b/iafd/src/iafd_scraper.py index bcedf19..e8c3d2f 100644 --- a/iafd/src/iafd_scraper.py +++ b/iafd/src/iafd_scraper.py @@ -343,7 +343,7 @@ def parse_credits_table(table, distributor_list): # 请求网页并提取所需数据 -def parse_page_performer(soup): +def parse_page_performer(soup, url): # 提取数据 data = {} @@ -434,13 +434,22 @@ def parse_page_movie(soup, href, title): values = info_div.find_all("p", class_="biodata") for label, value in zip(labels, values): key = label.text.strip() - val = value.text.strip() - if key in ["Distributor", "Studio", "Director"]: - link = value.find("a") - if link: - val = link.text.strip() - movie_data[f'{key}Href'] = host_url + link['href'] - movie_data[key] = val + if key == "Directors": # 解析多位导演的情况 + directors = [] + links = value.find_all("a") + for link in links: + director_name = link.text.strip() + director_href = host_url + link['href'] if link['href'] else '' + directors.append({"name": director_name, "href": director_href}) + movie_data[key] = directors + else: + val = value.text.strip() + if key in ["Distributor", "Studio", "Director"]: + link = value.find("a") + if link: + val = link.text.strip() + movie_data[f'{key}Href'] = host_url + link['href'] + movie_data[key] = val else: return None @@ -541,6 +550,7 @@ def parse_page_movie(soup, href, title): "DirectorHref": movie_data.get("DirectorHref", ""), "DistributorHref": movie_data.get("DistributorHref", ""), "StudioHref": movie_data.get("StudioHref", ""), + "Directors": movie_data.get("Directors", []), # 可能存在的元素 "Performers": performers, "SceneBreakdowns": scene_breakdowns, "AppearsIn": appears_in, diff --git a/iafd/src/sqlite_utils.py b/iafd/src/sqlite_utils.py index acd3161..be8f13e 100644 --- a/iafd/src/sqlite_utils.py +++ b/iafd/src/sqlite_utils.py @@ -344,6 +344,12 @@ def query_performer_hrefs(**filters): if "is_full_data" in filters: sql += " AND is_full_data = ?" params.append(filters["is_full_data"]) + if "start_id" in filters: + sql += " AND id > ?" + params.append(filters["start_id"]) + if "order_by" in filters: + sql += " order by ? asc" + params.append(filters["order_by"]) if 'limit' in filters: sql += " limit ?" params.append(filters["limit"]) @@ -572,7 +578,7 @@ def insert_or_update_movie(movie_data): studio_id = get_id_by_href('iafd_studios', movie_data['StudioHref']) director_id = get_id_by_href('iafd_performers', movie_data['DirectorHref']) # 导演不存在的话,插入一条 - if director_id is None: + if (director_id is None) and utils.is_valid_person_url(movie_data['DirectorHref']): director_id = insert_performer_index( movie_data['Director'], movie_data['DirectorHref'], from_movie_list=1) if studio_id is None: studio_id = 0 @@ -605,6 +611,22 @@ def insert_or_update_movie(movie_data): logging.debug(f'insert one move, id: {movie_id}, title: {movie_data['title']}, href: {movie_data['href']}') + # 导演-电影写入 关系表 + if director_id: + tmp_id = insert_performer_movie(director_id, movie_id, 'directoral', '') + if tmp_id: + logging.debug(f"insert one perfomer_movie. director_id: {director_id}, movie_id:{movie_id}") + for director in movie_data.get('Directors', []): + director_id = get_id_by_href('iafd_performers', director['href']) + # 如果演员不存在,先插入 + if (director_id is None) and utils.is_valid_person_url(director['href']): + director_id = insert_performer_index(director['name'], director['href'], from_movie_list=1) + logging.debug(f"insert one director. perfomer_id: {director_id}, movie_id:{movie_id} ") + if director_id: + tmp_id = insert_performer_movie(director_id, movie_id, 'directoral', '') + if tmp_id: + logging.debug(f"insert one perfomer_movie. director_id: {director_id}, movie_id:{movie_id}") + # 插入 performers_movies 关系表 for performer in movie_data.get('Performers', []): performer_id = get_id_by_href('iafd_performers', performer['href']) @@ -732,6 +754,12 @@ def query_movie_hrefs(**filters): if "is_full_data" in filters: sql += " AND is_full_data = ?" params.append(filters["is_full_data"]) + if "start_id" in filters: + sql += " AND id > ?" + params.append(filters["start_id"]) + if "order_by" in filters: + sql += " order by ?" + params.append(filters["order_by"]) if 'limit' in filters: sql += " limit ?" params.append(filters["limit"]) @@ -762,7 +790,7 @@ def get_performers_needed_update(limit=None): return [] # 生成一个复杂的演员电影数量的查询视图,来判断从电影列表中聚合出来的演员-影片数量,与从演员列表中抓取到的影片数量,是否相等。 -def create_view_and_indexes(): +def check_and_create_stat_table(taskid = 0): try: # 检查索引是否存在,如果不存在则创建 indexes = [ @@ -782,56 +810,57 @@ def create_view_and_indexes(): logging.info(f"Index {index_name} already exists.") # 检查视图是否存在,如果不存在则创建 - view_name = "view_perfomers_cnt" - cursor.execute("SELECT name FROM sqlite_master WHERE type='view' AND name=?", (view_name,)) - if not cursor.fetchone(): - create_view_sql = """ - CREATE VIEW view_perfomers_cnt AS + view_name = f"iafd_tmp_performers_stat_{taskid}" + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (view_name,)) + if cursor.fetchone(): + cursor.execute("drop table ?", (view_name,)) + conn.commit() + + create_view_sql = f""" + CREATE table {view_name} AS + SELECT + id, + href, + name, + movies_cnt, + SUM(CASE WHEN role = 'actor' THEN movie_count ELSE 0 END) AS actor_movie_count, + SUM(CASE WHEN role = 'director' THEN movie_count ELSE 0 END) AS director_movie_count + FROM ( SELECT - id, - href, - name, - movies_cnt, - SUM(CASE WHEN role = 'actor' THEN movie_count ELSE 0 END) AS actor_movie_count, - SUM(CASE WHEN role = 'director' THEN movie_count ELSE 0 END) AS director_movie_count - FROM ( - SELECT - p.id, - p.href, - p.name, - p.movies_cnt, - COUNT(apm.movie_id) AS movie_count, - 'actor' AS role - FROM - iafd_performers p - LEFT JOIN - iafd_performers_movies apm ON p.id = apm.performer_id - GROUP BY - p.id, p.href, p.name, p.movies_cnt - - UNION ALL - - SELECT - p.id, - p.href, - p.name, - p.movies_cnt, - COUNT(im.id) AS movie_count, - 'director' AS role - FROM - iafd_performers p - LEFT JOIN - iafd_movies im ON p.id = im.director_id - GROUP BY - p.id, p.href, p.name, p.movies_cnt - ) combined + p.id, + p.href, + p.name, + p.movies_cnt, + COUNT(apm.movie_id) AS movie_count, + 'actor' AS role + FROM + iafd_performers p + LEFT JOIN + iafd_performers_movies apm ON p.id = apm.performer_id GROUP BY - id, href, name, movies_cnt; - """ - cursor.execute(create_view_sql) - logging.info(f"View {view_name} created successfully.") - else: - logging.info(f"View {view_name} already exists.") + p.id, p.href, p.name, p.movies_cnt + + UNION ALL + + SELECT + p.id, + p.href, + p.name, + p.movies_cnt, + COUNT(im.id) AS movie_count, + 'director' AS role + FROM + iafd_performers p + LEFT JOIN + iafd_movies im ON p.id = im.director_id + GROUP BY + p.id, p.href, p.name, p.movies_cnt + ) combined + GROUP BY + id, href, name, movies_cnt; + """ + cursor.execute(create_view_sql) + logging.info(f"table {view_name} created successfully.") # 提交更改并关闭连接 conn.commit() @@ -913,7 +942,7 @@ def finalize_task_log(task_id): logging.error(f"任务 {task_id} 结束失败: {e}") if __name__ == "__main__": - create_view_and_indexes() + check_and_create_stat_table() ''' try: diff --git a/iafd/src/utils.py b/iafd/src/utils.py index 9c2fbdd..6bad9f6 100644 --- a/iafd/src/utils.py +++ b/iafd/src/utils.py @@ -32,6 +32,11 @@ def to_number(value): except (ValueError, TypeError): return 0 +def is_valid_person_url(url): + if 'person.rme' in url.lower(): + return True + return False + def dist_stu_href_rewrite(href): # 提取 ID(适用于 distrib 或 studio) import re