add some scripts.

2025-03-02 15:27:53 +08:00
parent 6b2e7f5281
commit d522dd9830
17 changed files with 2249514 additions and 0 deletions


@@ -0,0 +1,192 @@
"""
Script Name:
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
从而获取到一份完整的数据列表。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import json
import time
import csv
from bs4 import BeautifulSoup
import logging
import config
config.setup_logging()
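# The local config module is not part of this diff; the sketch below shows what
# config.setup_logging() is assumed to do (an assumption based on the logging setup
# used by the other scripts in this commit, not the actual implementation):
#
#     import logging
#
#     def setup_logging(level=logging.INFO):
#         logging.basicConfig(
#             level=level,
#             format="%(asctime)s - %(levelname)s - %(message)s",
#         )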
# Base URL and variable parameters
host_url = "https://www.iafd.com"
base_url = f"{host_url}/distrib.rme/distrib="
dist_list_url = f'{base_url}/distrib.asp'
distr_map = {
6812 : 'nubilefilms.com',
8563 : 'teenmegaworld network',
6779 : 'x-art.com',
7133 : 'tushy.com',
6496 : 'blacked.com',
7758 : 'vixen.com',
6791 : 'teamskeet.com',
12454: 'vip4k.com',
13541: 'wow network',
9702 : 'cum4k.com',
6778 : 'tiny4k.com',
12667: 'anal4k.com',
7419 : 'exotic4k.com',
13594: 'facials4k.com',
13633: 'mom4k.com',
12335: 'slim4k.com',
16709: 'strippers4k.com',
}
# Set up request headers and the cloudscraper session
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Output directory
res_dir = './result'
all_data = []
# Fetch a page and return its HTML text
def fetch_page(url):
try:
response = scraper.get(url, headers=headers)
response.raise_for_status()
return response.text
except Exception as e:
logging.error(f"Failed to fetch {url}: {e}")
return None
# Parse the HTML content and extract the required data
def parse_page(html, name):
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", id="distable")
if not table:
logging.warning(f"Warning: No 'distable' table found in {name}")
return None
# Find the thead and drop it
thead = table.find('thead')
if thead:
thead.decompose()  # the thead does not need to be parsed
# Only the tbody remains now
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
global all_data
for row in rows:
cols = row.find_all('td')
if len(cols) >= 5:
title = cols[0].text.strip()
label = cols[1].text.strip()
year = cols[2].text.strip()
rev = cols[3].text.strip()
a_href = cols[0].find('a')
href = host_url + a_href['href'] if a_href else ''
all_data.append({
'distributors': name,
'title': title,
'label': label,
'year': year,
'rev': rev,
'href': href
})
return soup
# Handle pagination (not needed for these distributor pages)
def handle_pagination(soup, astro):
return None
# Fetch the distributor list page and refresh distr_map
def process_list_gage():
global distr_map
logging.info(f"Fetching data for {dist_list_url} ...")
select_element = None
while True:
html = fetch_page(dist_list_url)
if html:
soup = BeautifulSoup(html, "html.parser")
select_element = soup.find('select', {'name': 'Distrib'})
if select_element :
break
else:
logging.info(f"wrong html content. retring {dist_list_url} ...")
else:
logging.info(f"wrong html content. retring {dist_list_url} ...")
if not select_element:
return None
options = select_element.find_all('option')
for option in options:
value = option.get('value')  # the option's value attribute
text = option.text.strip()  # the option's display text
distr_map[int(value)] = text
logging.info(f'Fetched {dist_list_url} successfully. Total distributors: {len(distr_map)}')
return True
# Main logic: loop over every distributor
def process_main_data():
for dis_key, dis_name in distr_map.items():
url = base_url + str(dis_key)
next_url = url
logging.info(f"Fetching data for {dis_name}, url {url} ...")
while next_url:
html = fetch_page(next_url)
if html:
soup = parse_page(html, dis_name)
if soup:
next_url = handle_pagination(soup, dis_name)
else:
logging.info(f"wrong html content. retring {next_url} ...")
# 定期保存结果
save_data()
time.sleep(2)  # throttle the request rate
else:
logging.info(f"Retrying {next_url} ...")
time.sleep(5)  # wait before retrying
# Save results to JSON and CSV files
def save_data():
with open(f'{res_dir}/distributors.json', 'w', encoding='utf-8') as json_file:
json.dump(all_data, json_file, indent=4, ensure_ascii=False)
with open(f'{res_dir}/distributors.csv', 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=['distributors', 'title', 'label', 'year', 'rev', 'href'])
writer.writeheader()
writer.writerows(all_data)
# Entry point
if __name__ == '__main__':
#process_list_gage()
process_main_data()
save_data()
logging.info("Data fetching and saving completed.")


@@ -0,0 +1,101 @@
import sqlite3
import json
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Database connection
DB_PATH = 'your_database.db'  # database path; change to the actual path
# Predefined tags; edit here as needed
TAG_LIST = ['vixen', 'blacked', 'tushy', 'x-art']
# Preload the tag IDs
def get_all_tag_ids():
try:
with sqlite3.connect(DB_PATH) as conn:
cursor = conn.cursor()
#cursor.execute("SELECT id, name FROM tags WHERE name IN ('vixen', 'blacked', 'tushy', 'x-art')")
cursor.execute("SELECT id, name FROM tags WHERE name IN ({})".format(', '.join(['?']*len(TAG_LIST))), TAG_LIST)
tags = cursor.fetchall()
# Map tag name -> tag_id
return {tag_name.lower(): tag_id for tag_id, tag_name in tags}
except Exception as e:
logger.error(f"Error fetching tag IDs: {e}")
return {}
# Look up performer_id for a batch of performer names
def get_performers_ids(performer_names):
try:
with sqlite3.connect(DB_PATH) as conn:
cursor = conn.cursor()
query = "SELECT id, name FROM performers WHERE LOWER(name) IN ({})".format(
','.join(['?'] * len(performer_names))
)
cursor.execute(query, [name.lower() for name in performer_names])
performers = cursor.fetchall()
return {performer_name.lower(): performer_id for performer_id, performer_name in performers}
except Exception as e:
logger.error(f"Error fetching performer IDs: {e}")
return {}
# Insert a row into the performers_tags table
def insert_performer_tag(performer_id, tag_id):
try:
with sqlite3.connect(DB_PATH) as conn:
cursor = conn.cursor()
# Skip the insert if this (performer_id, tag_id) pair already exists
cursor.execute("SELECT 1 FROM performers_tags WHERE performer_id = ? AND tag_id = ?", (performer_id, tag_id))
if not cursor.fetchone():
cursor.execute("INSERT INTO performers_tags (performer_id, tag_id) VALUES (?, ?)", (performer_id, tag_id))
conn.commit()
logger.info(f"Inserted performer_id {performer_id} and tag_id {tag_id} into performers_tags.")
else:
logger.info(f"Entry for performer_id {performer_id} and tag_id {tag_id} already exists in performers_tags.")
except Exception as e:
logger.error(f"Error inserting into performers_tags: {e}")
# Process the detail.json file
def process_detail_json(detail_file):
try:
with open(detail_file, 'r', encoding='utf-8') as f:
data = json.load(f)
# Fetch all tag IDs
tag_ids = get_all_tag_ids()
# Collect the performer names that need to be looked up
performer_names = [entry.get('person') for entry in data]
# Batch-query performers.id
performer_ids = get_performers_ids(performer_names)
for entry in data:
person = entry.get('person')
vixen_cnt = entry.get('vixen_cnt', 0)
blacked_cnt = entry.get('blacked_cnt', 0)
tushy_cnt = entry.get('tushy_cnt', 0)
x_art_cnt = entry.get('x_art_cnt', 0)
# Look up the performer_id
performer_id = performer_ids.get(person.lower())
if not performer_id:
continue  # skip this entry if no performer_id was found
# Process each tag (vixen, blacked, tushy, x-art)
for tag_name, count in zip(TAG_LIST, [vixen_cnt, blacked_cnt, tushy_cnt, x_art_cnt]):
if count > 0:
tag_id = tag_ids.get(tag_name)
if tag_id:
insert_performer_tag(performer_id, tag_id)
except Exception as e:
logger.error(f"Error processing {detail_file}: {e}")
# Main entry point
def main():
detail_file = 'detail.json'  # input file path; replace with the actual path
process_detail_json(detail_file)
if __name__ == "__main__":
main()

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,72 @@
import json
import csv
# Read the detail_birth.json file
def read_json(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
except FileNotFoundError:
print(f"文件 {file_path} 未找到.")
return []
except json.JSONDecodeError:
print(f"文件 {file_path} 解析错误.")
return []
# Write the data to a CSV file
def write_to_csv(data, output_file):
with open(output_file, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=[
'person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender',
'years_active', 'ethnicity', 'nationality', 'hair_colors', 'eye_color', 'height',
'weight', 'measurements', 'tattoos', 'piercings'
])
writer.writeheader()
for entry in data:
# Ensure performer_aka is always a list
performer_aka = entry.get('performer_aka', [])
# Normalize None to an empty list and wrap single values in a list
if performer_aka is None:
performer_aka = []
elif not isinstance(performer_aka, list):
performer_aka = [performer_aka]
# Write one row per entry
writer.writerow({
'person': entry.get('person', ''),
'href': entry.get('href', ''),
'performer_aka': performer_aka,
'birthday': entry.get('birthday', ''),
'astrology': entry.get('astrology', ''),
'birthplace': entry.get('birthplace', ''),
'gender': entry.get('gender', ''),
'years_active': entry.get('years_active', ''),
'ethnicity': entry.get('ethnicity', ''),
'nationality': entry.get('nationality', ''),
'hair_colors': entry.get('hair_colors', ''),
'eye_color': entry.get('eye_color', ''),
'height': entry.get('height', ''),
'weight': entry.get('weight', ''),
'measurements': entry.get('measurements', ''),
'tattoos': entry.get('tattoos', ''),
'piercings': entry.get('piercings', '')
})
# Main entry point: run the conversion
def main():
# Input JSON file path
input_json_file = 'detail_birth.json'
# Output CSV file path
output_csv_file = 'detail_birth.csv'
# Read the JSON file
data = read_json(input_json_file)
# Write the data to the CSV file
write_to_csv(data, output_csv_file)
print(f"数据已保存到 {output_csv_file}")
if __name__ == "__main__":
main()
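# Note: performer_aka is handed to csv.DictWriter as a Python list, so it lands in the CSV as
# the list's string form (e.g. "['Alias One', 'Alias Two']"). If plain text is preferred, one
# alternative (not what the script currently does) would be:
#     'performer_aka': ', '.join(performer_aka),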

scripts/iafd/merge/merged.csv: new file, 34867 additions (diff suppressed because it is too large)

scripts/iafd/merge/merged.json: new file, 139466 additions (diff suppressed because it is too large)

scripts/iafd/merge/result.json: new file, 518899 additions (diff suppressed because it is too large)

scripts/iafd/merge/stashdb.csv: new file, 21945 additions (diff suppressed because it is too large)

scripts/iafd/merge/stashdb.json: new file, 518859 additions (diff suppressed because it is too large)


@@ -0,0 +1,120 @@
import json
import logging
import cloudscraper
import time
from requests.exceptions import RequestException
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
test_flag = True
# Read stashdb.json
def read_json(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as file:
return json.load(file)
except FileNotFoundError:
logger.error(f"File {file_path} not found.")
return []
except json.JSONDecodeError:
logger.error(f"Error decoding JSON from {file_path}.")
return []
# Request the URL and return the final URL after redirects
def fetch_real_url_2(url, scraper):
try:
response = scraper.get(url, allow_redirects=True)
if response.status_code == 200:
return response.url  # the final URL after redirects
else:
logger.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
return None
except RequestException as e:
logger.error(f"Error fetching {url}: {e}")
return None
def fetch_real_url(url, scraper):
try:
# Request the URL with automatic redirects disabled
response = scraper.get(url, allow_redirects=False)
# For a 301/302 response, read the Location header
if response.status_code in (301, 302):
redirect_url = response.headers.get("Location")
if redirect_url:
logger.info(f"Redirected to: {redirect_url}")
return redirect_url
else:
logger.warning(f"Redirect response received, but no Location header found for {url}")
return None
else:
logger.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
return None
except RequestException as e:
logger.error(f"Error fetching {url}: {e}")
return None
# Process each entry's URLs
def process_urls(data, scraper):
loop = 0
global test_flag
for entry in data:
iafd_urls = entry.get('iafd_urls', [])
real_urls = []
for url in iafd_urls:
if 'perfid=' in url:
# A perfid= link redirects; fetch it and record the final URL
real_url = fetch_real_url(url, scraper)
if real_url:
real_urls.append(real_url)
# In test mode, only process a small batch
loop += 1
if test_flag and loop > 10:
return data
elif 'person.rme/id=' in url:
# A person.rme/id= link is already canonical; add it as-is
real_urls.append(url)
else:
# Unknown URL format; keep it but log a warning
real_urls.append(url)
logger.warning(f"unkown url format: {url}")
# Update the iafd_real_url field
entry['iafd_real_url'] = real_urls
return data
# Save the processed results to result.json
def save_to_json(data, output_file):
try:
with open(output_file, 'w', encoding='utf-8') as file:
json.dump(data, file, ensure_ascii=False, indent=4)
logger.info(f"Data saved to {output_file}")
except Exception as e:
logger.error(f"Error saving to {output_file}: {e}")
# Main entry point
def main():
# Input and output file paths
input_file = 'stashdb.json'
output_file = 'result.json'
# Create the cloudscraper session
scraper = cloudscraper.create_scraper()
# Load the data from stashdb.json
data = read_json(input_file)
# Resolve each URL to the URL it redirects to
processed_data = process_urls(data, scraper)
# Save the results to result.json
save_to_json(processed_data, output_file)
if __name__ == "__main__":
main()
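# For reference, each entry in stashdb.json is assumed to carry an 'iafd_urls' list; the script
# adds an 'iafd_real_url' list next to it. Illustrative shape only (values are placeholders):
#
#     {"iafd_urls": ["https://www.iafd.com/person.rme/perfid=.../..."],
#      "iafd_real_url": ["https://www.iafd.com/person.rme/id=..."]}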


@@ -0,0 +1,254 @@
import os
import json
import csv
import time
import logging
import sys
import signal
import re
import cloudscraper
from bs4 import BeautifulSoup
import config
config.setup_logging()
# Base URL
host_url = "https://www.iafd.com"
# Directories and file paths
RESULT_DIR = "result"
INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
OUTPUT_JSON = os.path.join(RESULT_DIR, "movie_details.json")
OUTPUT_CSV = os.path.join(RESULT_DIR, "movie_details.csv")
BATCH_SIZE = 100  # flush to the output files every 100 records
# Initialize the Cloudflare bypass scraper
scraper = cloudscraper.create_scraper()
all_movies_cache = []  # module-level snapshot so the exit handlers can save progress
def load_existing_data():
"""加载已处理的数据,支持续传"""
if os.path.exists(OUTPUT_JSON):
with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
try:
return json.load(f)
except json.JSONDecodeError:
return []
return []
def save_data(all_movies):
"""保存数据到 JSON 和 CSV 文件"""
logging.info("Saving data...")
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
json.dump(all_movies, f, indent=4, ensure_ascii=False)
with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
writer = csv.writer(f)
writer.writerow(["href", "title", "Minutes", "Distributor", "Studio", "ReleaseDate",
"AddedtoIAFDDate", "All-Girl", "All-Male", "Compilation", "Webscene", "Director"])
for movie in all_movies:
writer.writerow([movie["href"], movie["title"], movie["Minutes"], movie["Distributor"],
movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"], movie["All-Girl"],
movie["All-Male"], movie["Compilation"], movie["Webscene"], movie["Director"]])
def fetch_html(href):
"""请求网页并返回 HTML 内容"""
for attempt in range(3):
try:
response = scraper.get(href, timeout=10)
if response.status_code == 200:
return response.text
except Exception as e:
logging.warning(f"Error fetching {href}: {e}")
time.sleep(2)
logging.error(f"Failed to fetch {href} after 3 attempts")
return None
def parse_movie_details(html, href, title):
"""解析网页 HTML 并提取电影信息"""
soup = BeautifulSoup(html, "html.parser")
# Parse the basic movie info
movie_data = {}
director_href = ''
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
if info_div:
labels = info_div.find_all("p", class_="bioheading")
values = info_div.find_all("p", class_="biodata")
for label, value in zip(labels, values):
key = label.text.strip()
val = value.text.strip()
if key in ["Distributor", "Studio", "Director"]:
link = value.find("a")
if link:
val = link.text.strip()
if key == 'Director':
director_href = host_url + link['href']
movie_data[key] = val
else:
return None
# Parse the cast information
performers = []
cast_divs = soup.find_all("div", class_="castbox")
for cast in cast_divs:
performer = {}
link = cast.find("a")
if link:
performer["name"] = link.text.strip()
performer["href"] = host_url + link["href"]
performer["tags"] = [
tag.strip() for br in cast.find_all("br")
if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
]
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
performers.append(performer)
# Parse the scene breakdowns
scene_breakdowns = []
scene_table = soup.find("div", id="sceneinfo")
if scene_table:
rows = scene_table.find_all("tr")
for row in rows:
cols = row.find_all("td")
if len(cols) >= 2:
scene = cols[0].text.strip()
scene_performers = [p.strip() for p in cols[1].text.split(",")]
scene_breakdowns.append({"scene": scene, "performers": scene_performers})
appears_in = []
appears_divs = soup.find("div", id="appearssection")
if appears_divs:
rows = appears_divs.find_all("li")
for row in rows:
lnk = row.find("a")
if lnk:
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
return {
"href": href,
"title": title,
"Minutes": movie_data.get("Minutes", ""),
"Distributor": movie_data.get("Distributor", ""),
"Studio": movie_data.get("Studio", ""),
"ReleaseDate": movie_data.get("Release Date", ""),
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
"All-Girl": movie_data.get("All-Girl", ""),
"All-Male": movie_data.get("All-Male", ""),
"Compilation": movie_data.get("Compilation", ""),
"Webscene": movie_data.get("Webscene", ""),
"Director": movie_data.get("Director", ""),
"DirectorHref": director_href,
"Performers": performers,
"SceneBreakdowns": scene_breakdowns,
"AppearsIn": appears_in,
}
def process_movies():
"""Process the movie list."""
global all_movies_cache
all_movies = load_existing_data()
all_movies_cache = all_movies  # keep a snapshot the exit handlers can save
processed_hrefs = {movie["href"] for movie in all_movies}
# Read the movie list (movie_list.json)
with open(INPUT_FILE, "r", encoding="utf-8") as f:
movies = json.load(f)
new_movies = []
count = 0
for entry in movies:
href = entry["href"]
title = entry["title"]
if href in processed_hrefs:
continue  # skip entries that have already been processed
logging.info(f"Processing: {title} ({href})")
html = fetch_html(href)
if not html:
continue  # fetch failed; skip
movie = parse_movie_details(html, href, title)
if not movie:
continue  # parsing failed; skip this entry
new_movies.append(movie)
count += 1
# Flush to the output files every BATCH_SIZE records
if count % BATCH_SIZE == 0:
all_movies_cache = all_movies + new_movies
save_data(all_movies_cache)
# Final save
all_movies.extend(new_movies)
all_movies_cache = all_movies
save_data(all_movies)
logging.info("Task completed.")
# Extract the id value from a URL such as https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
"""从href中提取id参数"""
match = re.search(r'id=([a-f0-9\-]+)', href)
return match.group(1) if match else ''
# Fetch a single specified URL
def process_one(href):
# Initialize cloudscraper
scraper = cloudscraper.create_scraper()
# Fetch and parse the data
movie = {}
while True:
html = fetch_html(href)
if not html:
logging.warning(f'fetching {href} error. retrying...')
continue  # fetch failed; retry
movie = parse_movie_details(html, href, 'title')
if movie:
break
else:
logging.warning(f'fetching {href} error. retrying...')
continue  # parsing failed; retry
id = extract_id_from_href(href)
filename = f"{id}.json" # 用 - 替换空格
try:
with open(filename, 'w', encoding='utf-8') as json_file:
json.dump(movie, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {filename}: {e}")
print(f'Fetch succeeded. Result saved to {filename}')
def handle_exit_signal(signum, frame):
logging.info("Gracefully exiting... Saving remaining data to JSON and CSV.")
save_data(all_movies_cache)  # save_data() requires the data to write; pass the module-level snapshot
sys.exit(0)
# Full crawl
def main():
try:
# Register exit signal handlers
signal.signal(signal.SIGINT, handle_exit_signal) # Handle Ctrl+C
signal.signal(signal.SIGTERM, handle_exit_signal) # Handle kill signal
process_movies()
finally:
# Cleanup that runs when the program exits
save_data(all_movies_cache)
logging.info("Data processing completed.")
if __name__ == "__main__":
if len(sys.argv) > 1:
url = sys.argv[1]
process_one(url)
else:
main()
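# Usage (the actual script filename is not shown in this diff):
#   python <this_script>.py                    -> crawl every entry in result/movie_list.json
#   python <this_script>.py <iafd movie url>   -> fetch that single page and save the parsed
#                                                 result to <id>.json (id taken from the URL's
#                                                 id= parameter)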

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,191 @@
"""
Script Name:
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
从而获取到一份完整的数据列表。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import json
import time
import csv
from bs4 import BeautifulSoup
import logging
import config
config.setup_logging()
# Base URL and variable parameters
host_url = "https://www.iafd.com"
base_url = f"{host_url}/studio.rme/studio="
list_page_url = f'{base_url}/studio.asp'
studio_map = {
6812 : 'nubilefilms.com',
9811 : 'Teen Mega World',
6779 : 'x-art.com',
7133 : 'tushy.com',
6496 : 'blacked.com',
7758 : 'vixen.com',
6791 : 'teamskeet.com',
8052: 'wowgirls.com',
9702 : 'cum4k.com',
6778 : 'tiny4k.com',
12667: 'anal4k.com',
7419 : 'exotic4k.com',
13594: 'facials4k.com',
13633: 'mom4k.com',
12335: 'slim4k.com',
16709: 'strippers4k.com',
}
# Set up request headers and the cloudscraper session
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Output directory
res_dir = './result'
all_data = []
# Fetch a page and return its HTML text
def fetch_page(url):
try:
response = scraper.get(url, headers=headers)
response.raise_for_status()
return response.text
except Exception as e:
logging.error(f"Failed to fetch {url}: {e}")
return None
# Parse the HTML content and extract the required data
def parse_page(html, name):
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", id="studio")
if not table:
logging.warning(f"Warning: No 'studio' table found in {name}")
return None
# Find the thead and drop it
thead = table.find('thead')
if thead:
thead.decompose()  # the thead does not need to be parsed
# Only the tbody remains now
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
global all_data
for row in rows:
cols = row.find_all('td')
if len(cols) >= 5:
title = cols[0].text.strip()
label = cols[1].text.strip()
year = cols[2].text.strip()
rev = cols[3].text.strip()
a_href = cols[0].find('a')
href = host_url + a_href['href'] if a_href else ''
all_data.append({
'studios': name,
'title': title,
'label': label,
'year': year,
'rev': rev,
'href': href
})
return soup
# Handle pagination (not needed for these studio pages)
def handle_pagination(soup, astro):
return None
# Fetch the studio list page and refresh studio_map
def process_list_gage():
global studio_map
logging.info(f"Fetching data for {list_page_url} ...")
select_element = None
while True:
html = fetch_page(list_page_url)
if html:
soup = BeautifulSoup(html, "html.parser")
select_element = soup.find('select', {'name': 'Studio'})
if select_element :
break
else:
logging.info(f"wrong html content. retring {list_page_url} ...")
else:
logging.info(f"wrong html content. retring {list_page_url} ...")
if not select_element:
return None
options = select_element.find_all('option')
for option in options:
value = option.get('value') # 获取 value 属性
text = option.text.strip() # 获取文本内容
studio_map[int(value)] = text
logging.info(f'Fetched {list_page_url} successfully. Total studios: {len(studio_map)}')
return True
# Main logic: loop over every studio
def process_main_data():
for key, name in studio_map.items():
url = base_url + str(key)
next_url = url
logging.info(f"Fetching data for {name}, url {url} ...")
while next_url:
html = fetch_page(next_url)
if html:
soup = parse_page(html, name)
if soup:
next_url = handle_pagination(soup, name)
else:
logging.info(f"wrong html content. retring {next_url} ...")
# 定期保存结果
save_data()
time.sleep(2)  # throttle the request rate
else:
logging.info(f"Retrying {next_url} ...")
time.sleep(5)  # wait before retrying
# Save results to JSON and CSV files
def save_data():
with open(f'{res_dir}/studios.json', 'w', encoding='utf-8') as json_file:
json.dump(all_data, json_file, indent=4, ensure_ascii=False)
with open(f'{res_dir}/studios.csv', 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=['studios', 'title', 'label', 'year', 'rev', 'href'])
writer.writeheader()
writer.writerows(all_data)
# Entry point
if __name__ == '__main__':
#process_list_gage()
process_main_data()
save_data()
logging.info("Data fetching and saving completed.")