modify scripts
334 iafd/src_json/movie_detail_fetch.py Normal file
@@ -0,0 +1,334 @@
import os
import json
import csv
import time
import logging
import sys
import signal
import re
import cloudscraper
from bs4 import BeautifulSoup
import config

config.setup_logging()

# Base URL
host_url = "https://www.iafd.com"

# Directories and file paths
RESULT_DIR = "../result"
OUTPUT_DIR = f"{config.global_share_data_dir}/iafd"
INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, "movie_details.json")
OUTPUT_CSV = os.path.join(OUTPUT_DIR, "movie_details.csv")
BATCH_SIZE = 100  # flush results to disk every 100 records
movies_dir = f'{RESULT_DIR}/movies'

# Make sure the output directory exists before the first save
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Cloudflare bypass client
scraper = cloudscraper.create_scraper()

# All collected movie records
all_movies = []

def load_existing_data():
    """Load previously processed data so an interrupted run can resume."""
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                return []
    return []


def save_data():
    """Save the collected data to the JSON and CSV files."""
    logging.info("Saving data...")
    global all_movies

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(all_movies, f, indent=4, ensure_ascii=False)

    with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["href", "title", "Minutes", "Distributor", "Studio", "ReleaseDate",
                         "AddedtoIAFDDate", "All-Girl", "All-Male", "Compilation", "Webscene", "Director"])
        for movie in all_movies:
            writer.writerow([movie["href"], movie["title"], movie["Minutes"], movie["Distributor"],
                             movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"], movie["All-Girl"],
                             movie["All-Male"], movie["Compilation"], movie["Webscene"], movie["Director"]])

# Request a page and return its HTML
def fetch_html(href):
    """Request the page and return its HTML, retrying up to three times."""
    for attempt in range(3):
        try:
            response = scraper.get(href, timeout=10)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logging.warning(f"Error fetching {href}: {e}")
        time.sleep(2)

    logging.error(f"Failed to fetch {href} after 3 attempts")
    return None

# Parse the page HTML and extract the movie details
def parse_movie_details(html, href, title):
    """Parse the page HTML and extract the movie details."""
    soup = BeautifulSoup(html, "html.parser")

    # Basic movie information
    movie_data = {}
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            val = value.text.strip()
            if key in ["Distributor", "Studio", "Director"]:
                link = value.find("a")
                if link:
                    val = link.text.strip()
                    movie_data[f'{key}Href'] = host_url + link['href']
            movie_data[key] = val
    else:
        return None

    # Cast information
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]

        # Tags are the plain-text nodes that directly follow each <br>
        performer["tags"] = [
            tag.strip() for br in cast.find_all("br")
            if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
        ]

        performers.append(performer)

    # Scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")

        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()  # scene number
                performer_info = cols[1]      # performers plus link info

                # Keep the HTML before the first <br> or <br/> (preserves inline tags such as <i>)
                performer_html = str(performer_info)
                split_html = re.split(r"<br\s*/?>", performer_html, maxsplit=1)
                performers_html = split_html[0].strip()

                # Convert to plain text (strip the HTML tags, keep only the text content)
                performers_soup = BeautifulSoup(performers_html, "html.parser")
                performers_text = performers_soup.get_text()

                # Extract the performer names
                scene_performers = [p.strip() for p in performers_text.split(",")]

                # Try to pick up the webscene and studio links
                links_data = {}
                links = performer_info.find_all("a")
                if links:
                    webscene_title = links[0].text.strip() if len(links) > 0 else None
                    webscene = links[0]["href"] if len(links) > 0 else None
                    studio = links[1].text.strip() if len(links) > 1 else None
                    studio_lnk = links[1]["href"] if len(links) > 1 else None
                    links_data = {
                        "title": webscene_title,
                        "webscene": webscene,
                        "studio": studio,
                        "studio_lnk": studio_lnk,
                    }

                scene_data = {
                    "scene": scene,
                    "performers": scene_performers,
                    **links_data,
                }
                scene_breakdowns.append(scene_data)

    # Compilations this title appears in
    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})

    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": movie_data.get("DirectorHref", ""),
        "DistributorHref": movie_data.get("DistributorHref", ""),
        "StudioHref": movie_data.get("StudioHref", ""),
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }

# Create a sub-directory for a record
def create_sub_directory(base_dir, name):
    # Use the first character of the id, lower-cased, as the sub-directory name
    sub_dir = name[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    return full_path


# Extract the id value from a URL such as
# https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from the href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
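# A quick worked example of the helper above, using the URL from its comment
# (a sketch of the expected behaviour, not part of the original script):
#   extract_id_from_href("https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586")
#   returns "21898a3c-1ddd-4793-8d93-375d6db20586"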

# Write each movie to its own JSON file
def write_movie_json(href, data):
    # Resolve the target directory and file name
    movie_id = extract_id_from_href(href)
    movie_dir = create_sub_directory(movies_dir, movie_id)
    movie_filename = f"{movie_id}.json"
    full_path = os.path.join(movie_dir, movie_filename)

    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")

def process_movies():
    """Process the full movie list."""
    global all_movies
    all_movies = load_existing_data()
    processed_hrefs = {movie["href"] for movie in all_movies}

    # Read the movie list (movie_list.json)
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        movies = json.load(f)

    count = 0

    for entry in movies:
        href = entry["href"]
        title = entry["title"]

        if href in processed_hrefs:
            logging.info(f"Skipping existing: {title} ({href})")
            continue  # already processed

        logging.info(f"Processing: {title} ({href})")

        while True:
            html = fetch_html(href)
            if not html:
                logging.warning(f'Retrying {title} ({href})')
                continue  # fetch failed, try again
            else:
                movie = parse_movie_details(html, href, title)
                if not movie:
                    logging.warning(f'Retrying {title} ({href})')
                    continue  # parse failed, try again
                else:
                    all_movies.append(movie)
                    count += 1

                    # Write the per-movie JSON file
                    write_movie_json(href, movie)
                    break

        # Flush to the JSON/CSV files every BATCH_SIZE records
        if count % BATCH_SIZE == 0:
            save_data()

    # Final save
    save_data()

    logging.info("Task completed.")

# Fetch a single URL passed on the command line
def process_one(href):
    # Fetch and parse the page, retrying until it succeeds
    movie = {}
    while True:
        html = fetch_html(href)
        if not html:
            logging.warning(f'fetching {href} error. retrying...')
            continue  # fetch failed, try again

        movie = parse_movie_details(html, href, 'title')
        if movie:
            break
        else:
            logging.warning(f'parsing {href} error. retrying...')
            continue  # parse failed, try again

    if movie:
        write_movie_json(href, movie)

    print(f'fetch succeeded. saved result in {movies_dir}')

# Save whatever has been collected if the program is terminated
def handle_exit_signal(signum, frame):
    logging.info("Gracefully exiting... Saving remaining data to JSON and CSV.")
    save_data()
    sys.exit(0)


# Full crawl
def main():
    try:
        # Register exit-signal handlers
        signal.signal(signal.SIGINT, handle_exit_signal)   # Handle Ctrl+C
        signal.signal(signal.SIGTERM, handle_exit_signal)  # Handle kill signal
        process_movies()
    finally:
        # Cleanup that also runs on normal exit
        save_data()
        logging.info("Data processing completed.")


# Entry point: read the command-line arguments
if __name__ == "__main__":
    if len(sys.argv) > 1:
        url = sys.argv[1]
        process_one(url)
    else:
        main()
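# Usage sketch (how the entry point above is meant to be invoked; the single-URL
# example path is hypothetical and only illustrates the expected id= query format):
#   python movie_detail_fetch.py
#       full crawl of ../result/movie_list.json, resuming from movie_details.json
#   python movie_detail_fetch.py "https://www.iafd.com/title.rme/id=<movie-id>"
#       fetch one title page and write <movie-id>.json under ../result/movies/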