modify scripts: add forced full re-crawl by id, multi-director parsing, and a per-task performer stat table

This commit is contained in:
oscarz
2025-04-05 11:34:40 +08:00
parent 26ad0d7935
commit 37fef7c786
4 changed files with 115 additions and 64 deletions

View File

@ -245,7 +245,7 @@ def fetch_performers_detail_once(perfomers_list):
logging.debug(f"Fetching data for performer ({person}), url {url} ...") logging.debug(f"Fetching data for performer ({person}), url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id")) soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
if soup: if soup:
data = scraper.parse_page_performer(soup) data = scraper.parse_page_performer(soup, url)
if data: if data:
performer_id = db_tools.insert_or_update_performer({ performer_id = db_tools.insert_or_update_performer({
'href': url, 'href': url,
@ -281,10 +281,14 @@ def fetch_performers_detail_once(perfomers_list):
def fetch_performers_detail(): def fetch_performers_detail():
limit_count = 5 if debug else 100 limit_count = 5 if debug else 100
perfomers_list = [] perfomers_list = []
last_perfomer_id = 0
# 获取新演员的列表 # 获取新演员的列表
while True: while True:
perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count) if force: # 从头逐个遍历
perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, order_by='id asc', limit=limit_count)
else: # 只做更新
perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count)
if len(perfomers_list) < 1: if len(perfomers_list) < 1:
logging.info(f'all new performers fetched. ') logging.info(f'all new performers fetched. ')
break break
@ -308,12 +312,15 @@ def fetch_performers_detail():
def fetch_movies_detail(): def fetch_movies_detail():
limit_count = 10 if debug else 100 limit_count = 10 if debug else 100
movies_list = [] movies_list = []
last_movie_id = 0
while True: while True:
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count) if force: # 从头逐个遍历
movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, order_by='id asc', limit=limit_count)
else: # 只做更新
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
if len(movies_list) < 1: if len(movies_list) < 1:
logging.info(f'all movies fetched.') logging.info(f'all movies fetched.')
break break
last_movie_id = 0
succ_count = 0 succ_count = 0
for movie in movies_list: for movie in movies_list:
url = movie['href'] url = movie['href']

View File

@ -343,7 +343,7 @@ def parse_credits_table(table, distributor_list):
# 请求网页并提取所需数据 # 请求网页并提取所需数据
def parse_page_performer(soup): def parse_page_performer(soup, url):
# 提取数据 # 提取数据
data = {} data = {}
@ -434,13 +434,22 @@ def parse_page_movie(soup, href, title):
values = info_div.find_all("p", class_="biodata") values = info_div.find_all("p", class_="biodata")
for label, value in zip(labels, values): for label, value in zip(labels, values):
key = label.text.strip() key = label.text.strip()
val = value.text.strip() if key == "Directors": # 解析多位导演的情况
if key in ["Distributor", "Studio", "Director"]: directors = []
link = value.find("a") links = value.find_all("a")
if link: for link in links:
val = link.text.strip() director_name = link.text.strip()
movie_data[f'{key}Href'] = host_url + link['href'] director_href = host_url + link['href'] if link['href'] else ''
movie_data[key] = val directors.append({"name": director_name, "href": director_href})
movie_data[key] = directors
else:
val = value.text.strip()
if key in ["Distributor", "Studio", "Director"]:
link = value.find("a")
if link:
val = link.text.strip()
movie_data[f'{key}Href'] = host_url + link['href']
movie_data[key] = val
else: else:
return None return None
@ -541,6 +550,7 @@ def parse_page_movie(soup, href, title):
"DirectorHref": movie_data.get("DirectorHref", ""), "DirectorHref": movie_data.get("DirectorHref", ""),
"DistributorHref": movie_data.get("DistributorHref", ""), "DistributorHref": movie_data.get("DistributorHref", ""),
"StudioHref": movie_data.get("StudioHref", ""), "StudioHref": movie_data.get("StudioHref", ""),
"Directors": movie_data.get("Directors", []), # 可能存在的元素
"Performers": performers, "Performers": performers,
"SceneBreakdowns": scene_breakdowns, "SceneBreakdowns": scene_breakdowns,
"AppearsIn": appears_in, "AppearsIn": appears_in,

View File

@ -344,6 +344,12 @@ def query_performer_hrefs(**filters):
if "is_full_data" in filters: if "is_full_data" in filters:
sql += " AND is_full_data = ?" sql += " AND is_full_data = ?"
params.append(filters["is_full_data"]) params.append(filters["is_full_data"])
if "start_id" in filters:
sql += " AND id > ?"
params.append(filters["start_id"])
if "order_by" in filters:
sql += " order by ? asc"
params.append(filters["order_by"])
if 'limit' in filters: if 'limit' in filters:
sql += " limit ?" sql += " limit ?"
params.append(filters["limit"]) params.append(filters["limit"])
@ -572,7 +578,7 @@ def insert_or_update_movie(movie_data):
studio_id = get_id_by_href('iafd_studios', movie_data['StudioHref']) studio_id = get_id_by_href('iafd_studios', movie_data['StudioHref'])
director_id = get_id_by_href('iafd_performers', movie_data['DirectorHref']) director_id = get_id_by_href('iafd_performers', movie_data['DirectorHref'])
# 导演不存在的话,插入一条 # 导演不存在的话,插入一条
if director_id is None: if (director_id is None) and utils.is_valid_person_url(movie_data['DirectorHref']):
director_id = insert_performer_index( movie_data['Director'], movie_data['DirectorHref'], from_movie_list=1) director_id = insert_performer_index( movie_data['Director'], movie_data['DirectorHref'], from_movie_list=1)
if studio_id is None: if studio_id is None:
studio_id = 0 studio_id = 0
@ -605,6 +611,22 @@ def insert_or_update_movie(movie_data):
logging.debug(f'insert one move, id: {movie_id}, title: {movie_data['title']}, href: {movie_data['href']}') logging.debug(f'insert one move, id: {movie_id}, title: {movie_data['title']}, href: {movie_data['href']}')
# 导演-电影写入 关系表
if director_id:
tmp_id = insert_performer_movie(director_id, movie_id, 'directoral', '')
if tmp_id:
logging.debug(f"insert one perfomer_movie. director_id: {director_id}, movie_id:{movie_id}")
for director in movie_data.get('Directors', []):
director_id = get_id_by_href('iafd_performers', director['href'])
# 如果演员不存在,先插入
if (director_id is None) and utils.is_valid_person_url(director['href']):
director_id = insert_performer_index(director['name'], director['href'], from_movie_list=1)
logging.debug(f"insert one director. perfomer_id: {director_id}, movie_id:{movie_id} ")
if director_id:
tmp_id = insert_performer_movie(director_id, movie_id, 'directoral', '')
if tmp_id:
logging.debug(f"insert one perfomer_movie. director_id: {director_id}, movie_id:{movie_id}")
# 插入 performers_movies 关系表 # 插入 performers_movies 关系表
for performer in movie_data.get('Performers', []): for performer in movie_data.get('Performers', []):
performer_id = get_id_by_href('iafd_performers', performer['href']) performer_id = get_id_by_href('iafd_performers', performer['href'])
@ -732,6 +754,12 @@ def query_movie_hrefs(**filters):
if "is_full_data" in filters: if "is_full_data" in filters:
sql += " AND is_full_data = ?" sql += " AND is_full_data = ?"
params.append(filters["is_full_data"]) params.append(filters["is_full_data"])
if "start_id" in filters:
sql += " AND id > ?"
params.append(filters["start_id"])
if "order_by" in filters:
sql += " order by ?"
params.append(filters["order_by"])
if 'limit' in filters: if 'limit' in filters:
sql += " limit ?" sql += " limit ?"
params.append(filters["limit"]) params.append(filters["limit"])
@ -762,7 +790,7 @@ def get_performers_needed_update(limit=None):
return [] return []
# 生成一个复杂的演员电影数量的查询视图,来判断从电影列表中聚合出来的演员-影片数量,与从演员列表中抓取到的影片数量,是否相等。 # 生成一个复杂的演员电影数量的查询视图,来判断从电影列表中聚合出来的演员-影片数量,与从演员列表中抓取到的影片数量,是否相等。
def create_view_and_indexes(): def check_and_create_stat_table(taskid = 0):
try: try:
# 检查索引是否存在,如果不存在则创建 # 检查索引是否存在,如果不存在则创建
indexes = [ indexes = [
@ -782,56 +810,57 @@ def create_view_and_indexes():
logging.info(f"Index {index_name} already exists.") logging.info(f"Index {index_name} already exists.")
# 检查视图是否存在,如果不存在则创建 # 检查视图是否存在,如果不存在则创建
view_name = "view_perfomers_cnt" view_name = f"iafd_tmp_performers_stat_{taskid}"
cursor.execute("SELECT name FROM sqlite_master WHERE type='view' AND name=?", (view_name,)) cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (view_name,))
if not cursor.fetchone(): if cursor.fetchone():
create_view_sql = """ cursor.execute("drop table ?", (view_name,))
CREATE VIEW view_perfomers_cnt AS conn.commit()
create_view_sql = f"""
CREATE table {view_name} AS
SELECT
id,
href,
name,
movies_cnt,
SUM(CASE WHEN role = 'actor' THEN movie_count ELSE 0 END) AS actor_movie_count,
SUM(CASE WHEN role = 'director' THEN movie_count ELSE 0 END) AS director_movie_count
FROM (
SELECT SELECT
id, p.id,
href, p.href,
name, p.name,
movies_cnt, p.movies_cnt,
SUM(CASE WHEN role = 'actor' THEN movie_count ELSE 0 END) AS actor_movie_count, COUNT(apm.movie_id) AS movie_count,
SUM(CASE WHEN role = 'director' THEN movie_count ELSE 0 END) AS director_movie_count 'actor' AS role
FROM ( FROM
SELECT iafd_performers p
p.id, LEFT JOIN
p.href, iafd_performers_movies apm ON p.id = apm.performer_id
p.name,
p.movies_cnt,
COUNT(apm.movie_id) AS movie_count,
'actor' AS role
FROM
iafd_performers p
LEFT JOIN
iafd_performers_movies apm ON p.id = apm.performer_id
GROUP BY
p.id, p.href, p.name, p.movies_cnt
UNION ALL
SELECT
p.id,
p.href,
p.name,
p.movies_cnt,
COUNT(im.id) AS movie_count,
'director' AS role
FROM
iafd_performers p
LEFT JOIN
iafd_movies im ON p.id = im.director_id
GROUP BY
p.id, p.href, p.name, p.movies_cnt
) combined
GROUP BY GROUP BY
id, href, name, movies_cnt; p.id, p.href, p.name, p.movies_cnt
"""
cursor.execute(create_view_sql) UNION ALL
logging.info(f"View {view_name} created successfully.")
else: SELECT
logging.info(f"View {view_name} already exists.") p.id,
p.href,
p.name,
p.movies_cnt,
COUNT(im.id) AS movie_count,
'director' AS role
FROM
iafd_performers p
LEFT JOIN
iafd_movies im ON p.id = im.director_id
GROUP BY
p.id, p.href, p.name, p.movies_cnt
) combined
GROUP BY
id, href, name, movies_cnt;
"""
cursor.execute(create_view_sql)
logging.info(f"table {view_name} created successfully.")
# 提交更改并关闭连接 # 提交更改并关闭连接
conn.commit() conn.commit()
@ -913,7 +942,7 @@ def finalize_task_log(task_id):
logging.error(f"任务 {task_id} 结束失败: {e}") logging.error(f"任务 {task_id} 结束失败: {e}")
if __name__ == "__main__": if __name__ == "__main__":
create_view_and_indexes() check_and_create_stat_table()
''' '''
try: try:

View File

@ -32,6 +32,11 @@ def to_number(value):
except (ValueError, TypeError): except (ValueError, TypeError):
return 0 return 0
def is_valid_person_url(url):
    """Return True when *url* looks like a performer ("person") page link.

    The check is a case-insensitive substring match on ``person.rme``.
    Any value that is not a string (for example ``None``) is reported as
    invalid instead of raising ``AttributeError``.

    Args:
        url: Candidate href. Expected to be a ``str``; other values are
            tolerated and treated as invalid.

    Returns:
        bool: True if ``'person.rme'`` occurs in ``url`` ignoring case,
        otherwise False.
    """
    if not isinstance(url, str):
        # A missing href must not crash the caller; it is simply not a
        # person link.
        return False
    return 'person.rme' in url.lower()
def dist_stu_href_rewrite(href): def dist_stu_href_rewrite(href):
# 提取 ID适用于 distrib 或 studio # 提取 ID适用于 distrib 或 studio
import re import re