modify scripts
This commit is contained in:
@ -262,8 +262,11 @@ def fetch_performers_detail_once(perfomers_list):
|
|||||||
else:
|
else:
|
||||||
logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
|
logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
|
||||||
elif status_code and status_code == 404:
|
elif status_code and status_code == 404:
|
||||||
performer_id = db_tools.insert_or_update_performer_404(name=person, href=url)
|
performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=2)
|
||||||
logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skiping...')
|
logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skiping...')
|
||||||
|
elif status_code and status_code == 601:
|
||||||
|
performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=3)
|
||||||
|
logging.warning(f'601 page(wrong url). id: {performer_id}, name: {person}, url: {url}, Skiping...')
|
||||||
else:
|
else:
|
||||||
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
|
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
@ -285,8 +288,8 @@ def fetch_performers_detail():
|
|||||||
if debug:
|
if debug:
|
||||||
break
|
break
|
||||||
|
|
||||||
# 获取待更新的演员的列表
|
# 获取待更新的演员的列表,这个对账目前做的还有点问题
|
||||||
while True:
|
while False:
|
||||||
perfomers_list = db_tools.get_performers_needed_update(limit=limit_count)
|
perfomers_list = db_tools.get_performers_needed_update(limit=limit_count)
|
||||||
if len(perfomers_list) < 1:
|
if len(perfomers_list) < 1:
|
||||||
logging.info(f'all existed performers updated. ')
|
logging.info(f'all existed performers updated. ')
|
||||||
@ -334,8 +337,12 @@ def fetch_movies_detail():
|
|||||||
logging.warning(f'parse_page_movie error. url: {url}')
|
logging.warning(f'parse_page_movie error. url: {url}')
|
||||||
elif status_code and status_code == 404:
|
elif status_code and status_code == 404:
|
||||||
# 标记为已处理
|
# 标记为已处理
|
||||||
movie_id = db_tools.insert_or_update_movie_404(title=title, href=url)
|
movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=2)
|
||||||
logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
|
logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
|
||||||
|
elif status_code and status_code == 601:
|
||||||
|
# 标记为已处理
|
||||||
|
movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=3)
|
||||||
|
logging.warning(f'601 page(wrong url). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
|
||||||
else:
|
else:
|
||||||
logging.warning(f'fetch_page error. url: {url}')
|
logging.warning(f'fetch_page error. url: {url}')
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
@ -394,9 +401,10 @@ def main(cmd, args_debug, args_force):
|
|||||||
db_tools.finalize_task_log(task_id)
|
db_tools.finalize_task_log(task_id)
|
||||||
|
|
||||||
# TODO:
|
# TODO:
|
||||||
# 1, movies 更新之后,要给相应的 performers 表打个 is_full_data = 0, 然后刷新获取
|
# 1, 演员列表中的影片数量,与电影列表中聚合出来的影片数量,可能不同。一个原因是某个影片有多个导演,且导演又兼了演员。比如:
|
||||||
# 2, distributors 和 studios 对movie列表的互相检验
|
# https://www.iafd.com/title.rme/id=0f79d81f-25ff-40d1-967a-24b99f03b79a
|
||||||
# 3, 数据不规范问题,可以先手动导入所有 performers 和 movies ,然后用本程序增量获取新的
|
# https://www.iafd.com/person.rme/id=37efc86d-fefe-436d-8e3e-2e04b4e6565c
|
||||||
|
# 目前的movie表保存导演信息有遗漏。需要调整
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# 命令行参数处理
|
# 命令行参数处理
|
||||||
|
|||||||
@ -12,6 +12,7 @@ from bs4 import BeautifulSoup
|
|||||||
from requests.exceptions import RequestException
|
from requests.exceptions import RequestException
|
||||||
from functools import partial
|
from functools import partial
|
||||||
import config
|
import config
|
||||||
|
import utils
|
||||||
|
|
||||||
# 定义基础 URL 和可变参数
|
# 定义基础 URL 和可变参数
|
||||||
host_url = "https://www.iafd.com"
|
host_url = "https://www.iafd.com"
|
||||||
@ -35,13 +36,15 @@ headers = {
|
|||||||
}
|
}
|
||||||
scraper = cloudscraper.create_scraper()
|
scraper = cloudscraper.create_scraper()
|
||||||
|
|
||||||
|
save_raw_html = True
|
||||||
|
|
||||||
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
|
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
|
||||||
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
|
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
|
||||||
for attempt in range(max_retries):
|
for attempt in range(max_retries):
|
||||||
try:
|
try:
|
||||||
if host_url not in url.lower():
|
if host_url not in url.lower():
|
||||||
logging.error(f'wrong url format: {url}')
|
logging.error(f'wrong url format: {url}')
|
||||||
return None, None
|
return None, 601
|
||||||
|
|
||||||
response = scraper.get(url, headers=headers)
|
response = scraper.get(url, headers=headers)
|
||||||
|
|
||||||
@ -57,6 +60,9 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
|
|||||||
logging.debug(f"invalid or outdated page: {url}")
|
logging.debug(f"invalid or outdated page: {url}")
|
||||||
return None, 404 # 直接返回 404,调用方可以跳过
|
return None, 404 # 直接返回 404,调用方可以跳过
|
||||||
|
|
||||||
|
if save_raw_html:
|
||||||
|
utils.write_raw_html(url, response.text)
|
||||||
|
|
||||||
# 预处理 HTML(如果提供了 preprocessor)
|
# 预处理 HTML(如果提供了 preprocessor)
|
||||||
html_text = preprocessor(response.text) if preprocessor else response.text
|
html_text = preprocessor(response.text) if preprocessor else response.text
|
||||||
|
|
||||||
|
|||||||
@ -253,17 +253,17 @@ def insert_or_update_performer(data):
|
|||||||
|
|
||||||
|
|
||||||
# """插入或更新电影数据(异常url的处理,比如404链接)"""
|
# """插入或更新电影数据(异常url的处理,比如404链接)"""
|
||||||
def insert_or_update_performer_404(name, href):
|
def insert_or_update_performer_404(name, href, is_full_data=1):
|
||||||
try:
|
try:
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
INSERT INTO iafd_performers (href, name, is_full_data, updated_at)
|
INSERT INTO iafd_performers (href, name, is_full_data, updated_at)
|
||||||
VALUES (?, ?, 1, datetime('now', 'localtime'))
|
VALUES (?, ?, ?, datetime('now', 'localtime'))
|
||||||
ON CONFLICT(href) DO UPDATE SET
|
ON CONFLICT(href) DO UPDATE SET
|
||||||
name = excluded.name,
|
name = excluded.name,
|
||||||
is_full_data = 1,
|
is_full_data = excluded.is_full_data,
|
||||||
updated_at = datetime('now', 'localtime')
|
updated_at = datetime('now', 'localtime')
|
||||||
""", (
|
""", (
|
||||||
href, name
|
href, name, is_full_data
|
||||||
))
|
))
|
||||||
|
|
||||||
# 获取 performer_id
|
# 获取 performer_id
|
||||||
@ -645,17 +645,17 @@ def insert_or_update_movie(movie_data):
|
|||||||
|
|
||||||
|
|
||||||
# """插入或更新电影数据(异常url的处理,比如404链接)"""
|
# """插入或更新电影数据(异常url的处理,比如404链接)"""
|
||||||
def insert_or_update_movie_404(title, href):
|
def insert_or_update_movie_404(title, href, is_full_data=1):
|
||||||
try:
|
try:
|
||||||
# 插入或更新电影信息
|
# 插入或更新电影信息
|
||||||
cursor.execute(
|
cursor.execute(
|
||||||
"""
|
"""
|
||||||
INSERT INTO iafd_movies (title, href, is_full_data, updated_at)
|
INSERT INTO iafd_movies (title, href, is_full_data, updated_at)
|
||||||
VALUES (?, ?, 1, datetime('now', 'localtime'))
|
VALUES (?, ?, ?, datetime('now', 'localtime'))
|
||||||
ON CONFLICT(href) DO UPDATE SET
|
ON CONFLICT(href) DO UPDATE SET
|
||||||
title=excluded.title, is_full_data=1, updated_at = datetime('now', 'localtime')
|
title=excluded.title, is_full_data=excluded.is_full_data, updated_at = datetime('now', 'localtime')
|
||||||
""",
|
""",
|
||||||
(title, href)
|
(title, href, is_full_data)
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
@ -761,6 +761,85 @@ def get_performers_needed_update(limit=None):
|
|||||||
logging.error(f"查询 href 失败: {e}")
|
logging.error(f"查询 href 失败: {e}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# 生成一个复杂的演员电影数量的查询视图,来判断从电影列表中聚合出来的演员-影片数量,与从演员列表中抓取到的影片数量,是否相等。
|
||||||
|
def create_view_and_indexes():
    """Create the helper indexes and the ``view_perfomers_cnt`` view if missing.

    The view reports, for every row of ``iafd_performers``, the stored
    ``movies_cnt`` next to two counts aggregated from the movie tables:

    * ``actor_movie_count``    - rows in ``iafd_performers_movies`` whose
      ``performer_id`` matches the performer;
    * ``director_movie_count`` - rows in ``iafd_movies`` whose ``director_id``
      matches the performer.

    It is used to compare the movie count scraped from the performer page
    with the count aggregated from the movie list.

    Uses the ``cursor`` and ``conn`` objects from the enclosing module scope.
    Any ``sqlite3.Error`` is logged as a warning and swallowed; nothing is
    returned.
    """
    try:
        # Create each index only when sqlite_master does not list it yet.
        indexes = [
            ("idx_iafd_performers_movies_performer_id",
             "CREATE INDEX idx_iafd_performers_movies_performer_id ON iafd_performers_movies (performer_id);"),
            ("idx_iafd_movies_director_id",
             "CREATE INDEX idx_iafd_movies_director_id ON iafd_movies (director_id);"),
            ("idx_iafd_performers_id",
             "CREATE INDEX idx_iafd_performers_id ON iafd_performers (id);"),
        ]
        for index_name, create_index_sql in indexes:
            cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name=?", (index_name,))
            if not cursor.fetchone():
                cursor.execute(create_index_sql)
                logging.info(f"Index {index_name} created successfully.")
            else:
                logging.info(f"Index {index_name} already exists.")

        # Create the view only when sqlite_master does not list it yet.
        # NOTE: the view name keeps its historical spelling ("perfomers").
        view_name = "view_perfomers_cnt"
        cursor.execute("SELECT name FROM sqlite_master WHERE type='view' AND name=?", (view_name,))
        if not cursor.fetchone():
            # Two per-performer aggregates (as actor, as director) are stacked
            # with UNION ALL and then pivoted into one row per performer.
            create_view_sql = """
                CREATE VIEW view_perfomers_cnt AS
                SELECT
                    id,
                    href,
                    name,
                    movies_cnt,
                    SUM(CASE WHEN role = 'actor' THEN movie_count ELSE 0 END) AS actor_movie_count,
                    SUM(CASE WHEN role = 'director' THEN movie_count ELSE 0 END) AS director_movie_count
                FROM (
                    SELECT
                        p.id,
                        p.href,
                        p.name,
                        p.movies_cnt,
                        COUNT(apm.movie_id) AS movie_count,
                        'actor' AS role
                    FROM
                        iafd_performers p
                    LEFT JOIN
                        iafd_performers_movies apm ON p.id = apm.performer_id
                    GROUP BY
                        p.id, p.href, p.name, p.movies_cnt

                    UNION ALL

                    SELECT
                        p.id,
                        p.href,
                        p.name,
                        p.movies_cnt,
                        COUNT(im.id) AS movie_count,
                        'director' AS role
                    FROM
                        iafd_performers p
                    LEFT JOIN
                        iafd_movies im ON p.id = im.director_id
                    GROUP BY
                        p.id, p.href, p.name, p.movies_cnt
                ) combined
                GROUP BY
                    id, href, name, movies_cnt;
            """
            cursor.execute(create_view_sql)
            logging.info(f"View {view_name} created successfully.")
        else:
            logging.info(f"View {view_name} already exists.")

        # Commit the changes (the connection itself is left open).
        conn.commit()
    except sqlite3.Error as e:
        logging.warning(f"An error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# 插入一条任务日志
|
# 插入一条任务日志
|
||||||
def insert_task_log():
|
def insert_task_log():
|
||||||
try:
|
try:
|
||||||
@ -834,7 +913,9 @@ def finalize_task_log(task_id):
|
|||||||
logging.error(f"任务 {task_id} 结束失败: {e}")
|
logging.error(f"任务 {task_id} 结束失败: {e}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
create_view_and_indexes()
|
||||||
|
|
||||||
|
'''
|
||||||
try:
|
try:
|
||||||
with open('../result/detail.json', 'r') as file:
|
with open('../result/detail.json', 'r') as file:
|
||||||
performers = json.load(file)
|
performers = json.load(file)
|
||||||
@ -845,4 +926,6 @@ if __name__ == "__main__":
|
|||||||
#delete_performer("https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34")
|
#delete_performer("https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34")
|
||||||
print(query_performer_hrefs())
|
print(query_performer_hrefs())
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
logging.info("detail.json not found, starting fresh.")
|
logging.info("detail.json not found, starting fresh.")
|
||||||
|
|
||||||
|
'''
|
||||||
@ -87,6 +87,30 @@ def write_movie_json(href, data):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Error writing file {full_path}: {e}")
|
logging.error(f"Error writing file {full_path}: {e}")
|
||||||
|
|
||||||
|
# 保存抓取到的原始HTML,方便后续核验
|
||||||
|
def write_raw_html(href, html_text):
    """Save the raw HTML of a fetched page to disk for later verification.

    Only performer pages (URL contains ``person.rme``) and movie pages (URL
    contains ``title.rme``) are stored; any other URL is ignored. The file is
    written as ``<id>.html`` inside the directory returned by
    ``create_sub_directory(f"{update_dir}/raw_performers|raw_movies", id)``.

    Args:
        href: URL of the fetched page; its id is obtained with
            ``extract_id_from_href``.
        html_text: The page source to write (encoded as UTF-8).

    Returns:
        None. Write failures are logged as warnings and never raised.
    """
    # Decide the target directory first so unsupported URLs are skipped
    # before any id extraction or directory creation is attempted.
    href_lower = href.lower()
    if 'person.rme' in href_lower:
        dir_prefix = 'raw_performers'
    elif 'title.rme' in href_lower:
        dir_prefix = 'raw_movies'
    else:
        return

    page_id = extract_id_from_href(href)
    if not page_id:
        # Without an id the file would be saved as "None.html" / ".html".
        logging.warning(f"write_raw_html: cannot extract id from href: {href}")
        return

    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", page_id)
    file_name = f"{page_id}.html"
    full_path = os.path.join(file_dir, file_name)

    try:
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}。")
    except Exception as e:
        # Best effort: saving the raw page must never break the fetch itself.
        logging.warning(f"发生未知错误:{e}")
|
||||||
|
|
||||||
# 读取json文件并返回内容
|
# 读取json文件并返回内容
|
||||||
def read_json(file_path):
|
def read_json(file_path):
|
||||||
|
|||||||
Reference in New Issue
Block a user