modify scripts

oscarz committed on 2025-03-17 11:30:35 +08:00
parent e6327fbe73
commit d5dc76b87f
178 changed files with 44 additions and 184447 deletions

iafd/merge/auto_tag.py Normal file (+101 lines)

@@ -0,0 +1,101 @@
import sqlite3
import json
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Database connection
DB_PATH = 'your_database.db'  # database path; change to the actual path
# Predefined tags, kept here so they are easy to change
TAG_LIST = ['vixen', 'blacked', 'tushy', 'x-art']
# Preload tag IDs
def get_all_tag_ids():
try:
with sqlite3.connect(DB_PATH) as conn:
cursor = conn.cursor()
#cursor.execute("SELECT id, name FROM tags WHERE name IN ('vixen', 'blacked', 'tushy', 'x-art')")
cursor.execute("SELECT id, name FROM tags WHERE name IN ({})".format(', '.join(['?']*len(TAG_LIST))), TAG_LIST)
tags = cursor.fetchall()
# Build a mapping from tag name to tag_id
return {tag_name.lower(): tag_id for tag_id, tag_name in tags}
except Exception as e:
logger.error(f"Error fetching tag IDs: {e}")
return {}
# Batch-look up performer_id values by performer name
def get_performers_ids(performer_names):
try:
with sqlite3.connect(DB_PATH) as conn:
cursor = conn.cursor()
query = "SELECT id, name FROM performers WHERE LOWER(name) IN ({})".format(
','.join(['?'] * len(performer_names))
)
cursor.execute(query, [name.lower() for name in performer_names])
performers = cursor.fetchall()
return {performer_name.lower(): performer_id for performer_id, performer_name in performers}
except Exception as e:
logger.error(f"Error fetching performer IDs: {e}")
return {}
# Insert rows into the performers_tags table
def insert_performer_tag(performer_id, tag_id):
try:
with sqlite3.connect(DB_PATH) as conn:
cursor = conn.cursor()
# Check whether this pair already exists in performers_tags
cursor.execute("SELECT 1 FROM performers_tags WHERE performer_id = ? AND tag_id = ?", (performer_id, tag_id))
if not cursor.fetchone():
cursor.execute("INSERT INTO performers_tags (performer_id, tag_id) VALUES (?, ?)", (performer_id, tag_id))
conn.commit()
logger.info(f"Inserted performer_id {performer_id} and tag_id {tag_id} into performers_tags.")
else:
logger.info(f"Entry for performer_id {performer_id} and tag_id {tag_id} already exists in performers_tags.")
except Exception as e:
logger.error(f"Error inserting into performers_tags: {e}")
# Process the detail.json file
def process_detail_json(detail_file):
try:
with open(detail_file, 'r', encoding='utf-8') as f:
data = json.load(f)
# Get all tag IDs
tag_ids = get_all_tag_ids()
# Collect the performers.name values to look up
performer_names = [entry.get('person') for entry in data]
# Batch-query performers.id
performer_ids = get_performers_ids(performer_names)
for entry in data:
person = entry.get('person')
vixen_cnt = entry.get('vixen_cnt', 0)
blacked_cnt = entry.get('blacked_cnt', 0)
tushy_cnt = entry.get('tushy_cnt', 0)
x_art_cnt = entry.get('x_art_cnt', 0)
# Look up the performer_id
performer_id = performer_ids.get(person.lower())
if not performer_id:
continue  # skip this entry if no performer_id is found
# Handle each tag (vixen, blacked, tushy, x-art)
for tag_name, count in zip(TAG_LIST, [vixen_cnt, blacked_cnt, tushy_cnt, x_art_cnt]):
if count > 0:
tag_id = tag_ids.get(tag_name)
if tag_id:
insert_performer_tag(performer_id, tag_id)
except Exception as e:
logger.error(f"Error processing {detail_file}: {e}")
# Main entry point
def main():
detail_file = 'detail.json'  # input file path; replace with the actual path
process_detail_json(detail_file)
if __name__ == "__main__":
main()
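
A minimal sketch of the detail.json input that auto_tag.py reads, inferred from the fields the script accesses (the file name matches the default above; the values are illustrative, not real data):

# Illustrative only: write a detail.json in the shape auto_tag.py expects.
import json

sample = [
    {"person": "Example Performer", "vixen_cnt": 2, "blacked_cnt": 0, "tushy_cnt": 1, "x_art_cnt": 0},
]
with open("detail.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, ensure_ascii=False, indent=4)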

iafd/merge/json2csv.py Normal file (+72 lines)

@@ -0,0 +1,72 @@
import json
import csv
# Read the detail_birth.json file
def read_json(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
except FileNotFoundError:
print(f"File {file_path} not found.")
return []
except json.JSONDecodeError:
print(f"Failed to parse {file_path}.")
return []
# Write the data to a CSV file
def write_to_csv(data, output_file):
with open(output_file, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=[
'person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender',
'years_active', 'ethnicity', 'nationality', 'hair_colors', 'eye_color', 'height',
'weight', 'measurements', 'tattoos', 'piercings'
])
writer.writeheader()
for entry in data:
# Ensure performer_aka is always a list
performer_aka = entry.get('performer_aka', [])
# Normalize: None becomes an empty list; a non-list value is wrapped in a list
if performer_aka is None:
performer_aka = []
elif not isinstance(performer_aka, list):
performer_aka = [performer_aka]
# Write each row
writer.writerow({
'person': entry.get('person', ''),
'href': entry.get('href', ''),
'performer_aka': performer_aka,
'birthday': entry.get('birthday', ''),
'astrology': entry.get('astrology', ''),
'birthplace': entry.get('birthplace', ''),
'gender': entry.get('gender', ''),
'years_active': entry.get('years_active', ''),
'ethnicity': entry.get('ethnicity', ''),
'nationality': entry.get('nationality', ''),
'hair_colors': entry.get('hair_colors', ''),
'eye_color': entry.get('eye_color', ''),
'height': entry.get('height', ''),
'weight': entry.get('weight', ''),
'measurements': entry.get('measurements', ''),
'tattoos': entry.get('tattoos', ''),
'piercings': entry.get('piercings', '')
})
# Main entry point: run the conversion
def main():
# Input JSON file path
input_json_file = 'detail_birth.json'
# Output CSV file path
output_csv_file = 'detail_birth.csv'
# Read the JSON file
data = read_json(input_json_file)
# Write the data to the CSV file
write_to_csv(data, output_csv_file)
print(f"Data saved to {output_csv_file}")
if __name__ == "__main__":
main()

iafd/merge/url_match.py Normal file (+120 lines)

@@ -0,0 +1,120 @@
import json
import logging
import cloudscraper
import time
from requests.exceptions import RequestException
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
test_flag = True
# Read stashdb.json
def read_json(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as file:
return json.load(file)
except FileNotFoundError:
logger.error(f"File {file_path} not found.")
return []
except json.JSONDecodeError:
logger.error(f"Error decoding JSON from {file_path}.")
return []
# Request a URL and return the final URL after redirects
def fetch_real_url_2(url, scraper):
try:
response = scraper.get(url, allow_redirects=True)
if response.status_code == 200:
return response.url  # the final URL after redirects
else:
logger.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
return None
except RequestException as e:
logger.error(f"Error fetching {url}: {e}")
return None
def fetch_real_url(url, scraper):
try:
# Request the URL with automatic redirects disabled
response = scraper.get(url, allow_redirects=False)
# If the response is a 301/302, read the redirect target from the Location header
if response.status_code == 302 or response.status_code == 301:
redirect_url = response.headers.get("Location")
if redirect_url:
logger.info(f"Redirected to: {redirect_url}")
return redirect_url
else:
logger.warning(f"Redirect response received, but no Location header found for {url}")
return None
else:
logger.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
return None
except RequestException as e:
logger.error(f"Error fetching {url}: {e}")
return None
# Process each URL
def process_urls(data, scraper):
loop = 0
global test_flag
for entry in data:
iafd_urls = entry.get('iafd_urls', [])
real_urls = []
for url in iafd_urls:
if 'perfid=' in url:
# perfid link: request it and record the redirect target
real_url = fetch_real_url(url, scraper)
if real_url:
real_urls.append(real_url)
# In test mode, only process a small batch
loop = loop + 1
if test_flag and loop > 10:
return data
elif 'person.rme/id=' in url:
# Non-perfid link: keep it as-is
real_urls.append(url)
else:
# Unknown format: keep it, but log a warning
real_urls.append(url)
logger.warning(f"unknown url format: {url}")
# Update the iafd_real_url field
entry['iafd_real_url'] = real_urls
return data
# Save the processed result to result.json
def save_to_json(data, output_file):
try:
with open(output_file, 'w', encoding='utf-8') as file:
json.dump(data, file, ensure_ascii=False, indent=4)
logger.info(f"Data saved to {output_file}")
except Exception as e:
logger.error(f"Error saving to {output_file}: {e}")
# Main entry point
def main():
# Input and output file paths
input_file = 'stashdb.json'
output_file = 'result.json'
# Create a cloudscraper session
scraper = cloudscraper.create_scraper()
# Read the data from stashdb.json
data = read_json(input_file)
# Resolve the redirect target for each URL
processed_data = process_urls(data, scraper)
# Save the result to result.json
save_to_json(processed_data, output_file)
if __name__ == "__main__":
main()
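
A minimal sketch of the stashdb.json entries that url_match.py expects, based on the fields process_urls() touches: each entry carries an iafd_urls list, perfid links are resolved to their redirect target, and the results are written back under iafd_real_url (the URL below is hypothetical):

# Illustrative only: one input entry in the shape process_urls() iterates over.
sample_entry = {
    "iafd_urls": ["https://www.iafd.com/person.rme/perfid=example"]  # hypothetical perfid link
}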

iafd/src/config.py Normal file (+87 lines)

@@ -0,0 +1,87 @@
import logging
import os
import inspect
import time
from datetime import datetime
from logging.handlers import RotatingFileHandler
from collections import defaultdict
home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'
# Log-frequency bookkeeping
log_count = defaultdict(int)  # how many times each message has been logged
last_log_time = defaultdict(float)  # timestamp of the last write for each message
class RateLimitFilter(logging.Filter):
"""
Rate-limiting filter:
1. Within a 60-second window, the same message is written at most LOG_LIMIT times; anything beyond that is dropped.
2. Intended to also warn when the overall rate exceeds 100 messages/second (not implemented here).
"""
LOG_LIMIT = 60  # at most 60 identical messages per minute
def filter(self, record):
global log_count, last_log_time
message_key = record.getMessage()  # the log message text
# Current time and time elapsed since the last identical message
now = time.time()
elapsed = now - last_log_time[message_key]
# Throttle identical messages
if elapsed < 60:  # within the 60-second window
log_count[message_key] += 1
if log_count[message_key] > self.LOG_LIMIT:
print('reach limit.')
return False  # drop the record
else:
log_count[message_key] = 1  # window expired, restart the count
last_log_time[message_key] = now
return True  # allow the record
def setup_logging(log_filename=None):
if log_filename is None:
caller_frame = inspect.stack()[1]
caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
current_date = datetime.now().strftime('%Y%m%d')
log_filename = f'../log/{caller_filename}_{current_date}.log'
max_log_size = 100 * 1024 * 1024  # 100 MB
max_log_files = 10  # keep at most 10 rotated log files
file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
file_handler.setFormatter(logging.Formatter(
'%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
))
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter(
'%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
))
# Create the root logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = []  # avoid adding duplicate handlers
logger.addHandler(file_handler)
logger.addHandler(console_handler)
# Attach the rate-limit filter to both handlers
rate_limit_filter = RateLimitFilter()
file_handler.addFilter(rate_limit_filter)
console_handler.addFilter(rate_limit_filter)
# Example run
if __name__ == "__main__":
setup_logging()
for i in range(1000):
logging.info("test log message to exercise the rate limit")
time.sleep(0.01)  # simulate rapid logging
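
For reference, this is how the other scripts in this commit consume config.py (see fetch.py and load.py); the log message itself is illustrative:

import logging
import config

config.setup_logging()              # installs the file + console handlers with RateLimitFilter attached
logging.info("fetch task started")  # written to ../log/<caller>_<YYYYMMDD>.log and to the console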

iafd/src/fetch.py Normal file (+411 lines)

@@ -0,0 +1,411 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import iafd_scraper as scraper
import utils
config.setup_logging()
debug = False
force = False
# Fetch the performer list by astrological sign (no pagination)
def fetch_performers_by_astro():
for astro in scraper.astro_list:
url = scraper.astr_base_url + astro
logging.info(f"Fetching data for {astro}, url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_astro(soup, astro)
if list_data:
for row in list_data :
# Write to the performer index table
perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_astro_list=1)
if perfomer_id:
logging.debug(f'insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}')
else:
logging.warning(f'insert performer index failed. name: {row['person']}, href:{row['href']}')
else:
logging.warning(f'fetch astro error. {url} ...')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
logging.warning(f'fetch astro error. {url} ...')
# In debug mode, stop after the first sign
if debug:
break
# Fetch the performer list by birthday (no pagination)
def fetch_performers_by_birth():
for month in range(1, 13):  # months 1-12
for day in range(1, 32):  # days 1-31
url = scraper.birth_base_url.format(month=month, day=day)
logging.info(f"Fetching data for birth, url {url}")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
if soup:
list_data, next_url = scraper.parse_page_birth(soup, month, day)
if list_data:
for row in list_data :
# Write to the performer index table
perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_birth_list=1)
if perfomer_id:
logging.debug(f'insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}')
else:
logging.warning(f'insert performer index failed. name: {row['person']}, href:{row['href']}')
else:
logging.warning(f'fetch birth error. {url} ...')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
logging.warning(f'fetch birth error. {url} ...')
# In debug mode, stop after the first page
if debug:
return True
# Refresh the ethnicity list
def fetch_ethic_list():
url = scraper.ethnic_list_url
logging.info(f"Fetching data for performer's ethnic list, url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="ethnicity1", attr_type="id"))
if soup:
list_data = scraper.parse_page_ethnic_list(soup, url)
if list_data:
for row in list_data :
dist_id = db_tools.insert_or_update_ethnic({'name': row['name'], 'href': row.get('href', '')})
if dist_id:
logging.debug(f'insert one record into ethnic table. id:{dist_id}, name: {row['name']}, href:{row.get('href', '')}')
else:
logging.warning(f'fetch ethnic error. {url} ...')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
logging.warning(f'fetch page error. {url} ...')
# Fetch the performer list by ethnicity (with pagination)
def fetch_performers_by_ethnic():
# Refresh the ethnicity list first
fetch_ethic_list()
ethnic_list = db_tools.query_ethnic_hrefs()
for row in ethnic_list:
url = row['href']
ethnic = row['name']
next_url = url
while next_url:
logging.info(f"Fetching data for {ethnic}, url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
parser="lxml", preprocessor=scraper.preprocess_html)
if soup:
list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
if list_data:
for row in list_data :
# Write to the performer index table
perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_ethnic_list=1)
if perfomer_id:
logging.debug(f'insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}')
else:
logging.warning(f'insert performer index failed. name: {row['person']}, href:{row['href']}')
else:
logging.warning(f'fetch ethnic error. {url} ...')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skipping...')
break
else:
logging.warning(f'fetch ethnic error. {url} ...')
# In debug mode, stop after the first ethnicity
if debug:
return True
# Fetch the distributors list
def fetch_distributors_list():
url = scraper.distributors_list_url
logging.info(f"Fetching data for distributors list, url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
if list_data:
for row in list_data :
dis_url = scraper.distributors_base_url + row['href']
dist_id = db_tools.insert_or_update_distributor({'name': row['name'], 'href': dis_url})
if dist_id:
logging.debug(f'insert one record into distributors table. id:{dist_id}, name: {row['name']}, href:{dis_url}')
else:
logging.warning(f'fetch distributors error. {url} ...')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
logging.warning(f'fetch distributors error. {url} ...')
# Fetch the studios list
def fetch_studios_list():
url = scraper.studios_list_url
logging.info(f"Fetching data for studios list, url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
if list_data:
for row in list_data :
stu_url = scraper.studios_base_url + row['href']
stu_id = db_tools.insert_or_update_studio({'name': row['name'], 'href': stu_url})
if stu_id:
logging.debug(f'insert one record into studios table. id:{stu_id}, name: {row['name']}, href:{stu_url}')
else:
logging.warning(f'fetch studios error. {url} ...')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
logging.warning(f'fetch studios error. {url} ...')
# Refresh movie info from the distributors list
def fetch_movies_by_dist():
# Refresh the distributors list first
fetch_distributors_list()
url_list = db_tools.query_distributor_hrefs()
if debug:
url_list = db_tools.query_distributor_hrefs(name='vixen.com')
for url in url_list:
logging.info(f"Fetching data for distributor url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
if list_data:
for movie in list_data:
tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_dist_list=1)
if tmp_id:
logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}')
else:
logging.warning(f'insert movie index failed. title: {movie['title']}, href: {movie['href']}')
else :
logging.warning(f'parse_page_movie error. url: {url}')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
logging.warning(f'fetching page error. {url}')
# In debug mode, stop after the first URL
if debug:
break
# Refresh movie info from the studios list
def fetch_movies_by_stu():
# Refresh the studios list first
fetch_studios_list()
url_list = db_tools.query_studio_hrefs()
if debug:
url_list = db_tools.query_studio_hrefs(name='vixen.com')
for url in url_list:
logging.info(f"Fetching data for studio url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
if list_data:
for movie in list_data:
tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_stu_list=1)
if tmp_id:
logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}')
else:
logging.warning(f'insert movie index failed. title: {movie['title']}, href: {movie['href']}')
else :
logging.warning(f'parse_page_movie error. url: {url}')
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
logging.warning(f'fetching page error. {url}')
# In debug mode, stop after the first URL
if debug:
break
# Update performer details, one batch at a time
def fetch_performers_detail_once(perfomers_list):
last_performer_id = 0
for performer in perfomers_list:
url = performer['href']
person = performer['name']
logging.info(f"Fetching data for performer ({person}), url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
if soup:
data = scraper.parse_page_performer(soup)
if data:
performer_id = db_tools.insert_or_update_performer({
'href': url,
'person': person,
**data
})
if performer_id:
logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
last_performer_id = performer_id
else:
logging.warning(f'insert person: ({person}) {url} failed.')
# Also write the data to a local JSON file
utils.write_person_json(person, url, {
'href': url,
'person': person,
**data
})
else:
logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
elif status_code and status_code == 404:
performer_id = db_tools.insert_or_update_performer_404(name=person, href=url)
logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skipping...')
else:
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
time.sleep(1)
return last_performer_id
# Update performer details
def fetch_performers_detail():
limit_count = 5 if debug else 100
perfomers_list = []
# Fetch the list of new performers
while True:
perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count)
if len(perfomers_list) < 1:
logging.info(f'all new performers fetched. ')
break
last_perfomer_id = fetch_performers_detail_once(perfomers_list)
logging.info(f'insert {len(perfomers_list)} person. last performer id: {last_perfomer_id}')
if debug:
break
# Fetch the list of performers that need an update
while True:
perfomers_list = db_tools.get_performers_needed_update(limit=limit_count)
if len(perfomers_list) < 1:
logging.info(f'all existed performers updated. ')
break
last_perfomer_id = fetch_performers_detail_once(perfomers_list)
logging.info(f'insert {len(perfomers_list)} person. last performer id: {last_perfomer_id}')
if debug:
break
# Update movie details
def fetch_movies_detail():
limit_count = 10 if debug else 100
movies_list = []
while True:
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
if len(movies_list) < 1:
logging.info(f'all movies fetched.')
break
last_movie_id = 0
succ_count = 0
for movie in movies_list:
url = movie['href']
title = movie['title']
logging.debug(f"Fetching data for movie ({title}), url {url} ...")
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
if soup:
movie_data = scraper.parse_page_movie(soup, url, title)
if movie_data :
# Normalize non-standard URLs
if movie_data['DistributorHref']:
movie_data['DistributorHref'] = utils.dist_stu_href_rewrite(movie_data['DistributorHref'].lower())
if movie_data['StudioHref']:
movie_data['StudioHref'] = utils.dist_stu_href_rewrite(movie_data['StudioHref'].lower())
movie_id = db_tools.insert_or_update_movie(movie_data)
if movie_id:
logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
last_movie_id = movie_id
succ_count += 1
else:
logging.warning(f'insert movie {url} failed.')
# Also write the data to a local JSON file
utils.write_movie_json(url, movie_data)
else:
logging.warning(f'parse_page_movie error. url: {url}')
elif status_code and status_code == 404:
# Mark as processed
movie_id = db_tools.insert_or_update_movie_404(title=title, href=url)
logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
else:
logging.warning(f'fetch_page error. url: {url}')
time.sleep(1)
logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
# In debug mode, stop after one batch
if debug:
return True
# Map shortcut names to their functions
function_map = {
"astro": fetch_performers_by_astro,
"birth": fetch_performers_by_birth,
"ethnic": fetch_performers_by_ethnic,
"dist" : fetch_movies_by_dist,
"stu" : fetch_movies_by_stu,
"performers": fetch_performers_detail,
"movies" : fetch_movies_detail,
}
# Main entry point
def main(cmd, args_debug, args_force):
global debug
debug = args_debug
global force
force = args_force
# Start a task log entry
task_id = db_tools.insert_task_log()
if task_id is None:
logging.warning(f'insert task log error.')
return None
logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, cmd: {cmd}')
# Run the requested functions
if cmd:
function_names = cmd.split(",")  # split the comma-separated input
for short_name in function_names:
func = function_map.get(short_name.strip())  # look up the mapped function
if callable(func):
db_tools.update_task_log(task_id, task_status=f'Running {func}')
func()
else:
print(f"Warning: {short_name} is not a valid function shortcut.")
else:  # no --cmd given: run everything
for name, func in function_map.items():
if callable(func):
db_tools.update_task_log(task_id, task_status=f'Running {func}')
func()
else:
print(f"Warning: {name} is not a valid function shortcut.")
logging.info(f'all process completed!')
db_tools.finalize_task_log(task_id)
# TODO:
# 1. After movies are updated, set is_full_data = 0 on the related performers and re-fetch them.
# 2. Cross-check the movie lists between distributors and studios.
# 3. For inconsistent data, manually import all performers and movies first, then use this program to fetch new records incrementally.
if __name__ == "__main__":
# Command-line argument handling
keys_str = ",".join(function_map.keys())
parser = argparse.ArgumentParser(description='fetch iafd data.')
parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
args = parser.parse_args()
main(args.cmd, args.debug, args.force)
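
Example invocations, based on the argparse options defined above (the combinations shown are illustrative):

# python fetch.py --cmd astro,birth --debug   # run only the astro and birth fetchers on small batches
# python fetch.py --cmd performers            # refresh performer detail pages
# python fetch.py                             # no --cmd: run every function in function_map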

iafd/src/iafd_scraper.py Normal file (+562 lines)

@@ -0,0 +1,562 @@
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config
# Base URLs and variable parameters
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="
studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="
ethnic_list_url = f'{host_url}/advsearch.asp'
# Request headers and the cloudscraper session
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Fetch a page with cloudscraper and validate it; supports different parsers and optional preprocessing
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
for attempt in range(max_retries):
try:
if host_url not in url.lower():
logging.error(f'wrong url format: {url}')
return None, None
response = scraper.get(url, headers=headers)
# Handle HTTP status codes
if response.status_code == 404:
logging.debug(f"Page not found (404): {url}")
return None, 404  # return 404 so the caller can skip this page
response.raise_for_status()  # raise on other HTTP errors
# Treat expired pages the same as a 404
if "invalid or outdated page" in response.text.lower():
logging.debug(f"invalid or outdated page: {url}")
return None, 404  # return 404 so the caller can skip this page
# Preprocess the HTML if a preprocessor was supplied
html_text = preprocessor(response.text) if preprocessor else response.text
soup = BeautifulSoup(html_text, parser)
if validator(soup):  # run the page-specific validation
return soup, response.status_code
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except cloudscraper.exceptions.CloudflareChallengeError as e:
logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
except cloudscraper.exceptions.CloudflareCode1020 as e:
logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
except Exception as e:
logging.error(f"Unexpected error on {url}: {e}, Retrying...")
logging.error(f'Fetching failed after max retries. {url}')
return None, None  # still failing after the maximum number of retries
# Fix up the HTML: drop stray <br> tags and adjust <a> tags (needed when fetching the ethnicity pages)
def preprocess_html(html):
return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
if attr_type == "id":
return soup.find(tag, id=identifier) is not None
elif attr_type == "class":
return bool(soup.find_all(tag, class_=identifier))
elif attr_type == "name":
return bool(soup.find('select', {'name': identifier}))
return False
# Check whether the movie table exists
def movie_validator(soup, table_id):
return soup.find("table", id=table_id) is not None
# Parse the HTML and extract the data we need
def parse_page_ethnic_list(soup, href):
div_root = soup.find("select", id="ethnicity1")
if not div_root:
logging.warning(f"Warning: No 'ethnicity1' select found in {href}")
return None, None
list_data = []
# Extract all the <option> tags
options = div_root.find_all('option')
if options:
# Parse each option's value attribute and text
for option in options:
href = option.get('value', None)
text = option.text.strip()
if href and href.lower() == 'none':
continue
list_data.append({
"name": text,
"href": host_url + href if href else ''
})
return list_data
# Parse the HTML and extract the data we need
def parse_page_astro(soup, astro):
astro_div = soup.find("div", id="astro")
if not astro_div:
logging.warning(f"Warning: No 'astro' div found in {astro}")
return None, None
flag = False
list_cnt = 0
list_data = []
next_url = None
birth_date = None
for elem in astro_div.find_all(recursive=False):
if elem.name == "h3" and "astroday" in elem.get("class", []):
birth_date = elem.get_text(strip=True)
elif elem.name == "div" and "perficon" in elem.get("class", []):
a_tag = elem.find("a")
if a_tag:
href = host_url + a_tag["href"]
name = a_tag.find("span", class_="perfname")
if name:
list_data.append({
"astrology": astro,
"birth_date": birth_date,
"person": name.get_text(strip=True),
"href": href
})
flag = True
list_cnt = list_cnt +1
if flag:
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
return list_data, next_url
else:
return None, None
# Parse the page content and build the birthday list
def parse_page_birth(soup, month, day):
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
if not datarows:
return None, None
flag = False
list_cnt = 0
list_data = []
next_url = None
rows = datarows[0].find_all('div', class_='col-sm-4')
for row in rows:
link_tag = row.find('a')
person = link_tag.text.strip() if link_tag else ''
href = link_tag['href'] if link_tag else ''
href = host_url + href
# Skip if this href has already been collected
flag = True
if any(entry['href'] == href for entry in list_data):
continue
# Add the entry to the list
list_data.append({
'month': month,
'day': day,
'person': person,
'href': href
})
list_cnt = list_cnt +1
if flag:
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
return list_data, next_url
else:
return None, None
# Parse the HTML and extract the data we need
def parse_page_ethnic(soup, ethnic):
rows = soup.find_all('div', class_='row headshotrow')
flag = False
list_data = []
next_url = None
for row in rows:
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
link_tag = col.find('a')
img_tag = col.find('div', class_='pictag')
flag = True
if link_tag and img_tag:
href = host_url + link_tag['href']
person = img_tag.text.strip()
# 将数据存储到 ethnic_map
list_data.append({
'ethnic': ethnic,
'person': person,
'href': href
})
if flag:
logging.debug(f"get {len(list_data)} persons from this page.")
next_page = soup.find('a', rel='next')
if next_page:
next_url = host_url + next_page['href']
logging.debug(f"Found next page: {next_url}")
return list_data, next_url
else:
logging.debug(f"All pages fetched for {ethnic}.")
return list_data, None
else:
return None, None
# Parse a list page (the distributors/studios select element)
def parse_page_dist_stu_list(soup, select_name):
list_data = []
next_url = None
select_element = soup.find('select', {'name': select_name})
if select_element :
options = select_element.find_all('option')
for option in options:
value = option.get('value')  # the value attribute
text = option.text.strip()  # the display text
list_data.append({
'name' : text,
'href' : str(value)
})
return list_data, next_url
else:
return None, None
# Parse the HTML and extract the data we need
def parse_page_dist_stu(soup, table_id):
table = soup.find("table", id=table_id)
if not table:
logging.warning(f"Warning: No {table_id} table found ")
return None, None
# Find the thead and drop it; it is not needed for parsing
thead = table.find('thead')
if thead:
thead.decompose()
# Only the tbody rows remain
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
list_data = []
next_url = None
for row in rows:
cols = row.find_all('td')
if len(cols) >= 5:
title = cols[0].text.strip()
label = cols[1].text.strip()
year = cols[2].text.strip()
rev = cols[3].text.strip()
a_href = cols[0].find('a')
href = host_url + a_href['href'] if a_href else ''
list_data.append({
'title': title,
'label': label,
'year': year,
'rev': rev,
'href': href
})
return list_data, next_url
# Parse a credits table (covers both performer and directorial credits)
def parse_credits_table(table, distributor_list):
# Find the thead and drop it; it is not needed for parsing
thead = table.find('thead')
if thead:
thead.decompose()
# Only the tbody rows remain
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
movies = []
distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数
# rows = table.find_all('tr', class_='we')
for row in rows:
#tr_class = row.get('class', '')
tr_class = ' '.join(row.get('class', []))  # class attribute as a string, empty if missing
cols = row.find_all('td')
if len(cols) >= 6:
title = cols[0].text.strip()
href_a = cols[0].find('a')
href = href_a['href'] if href_a else ''
year = cols[1].text.strip()
distributor = cols[2].text.strip().lower()
href_d = cols[2].find('a')
href_dist = host_url + href_d['href'] if href_d else ''
notes = cols[3].text.strip()
rev = cols[4].text.strip()
formats = cols[5].text.strip()
for key in distributor_list:
if key in distributor:
distributor_count[key] += 1
movies.append({
'title': title,
'href' : href,
'year': year,
'distributor': distributor,
'distributor_href': href_dist,
'notes': notes,
'rev': rev,
'formats': formats,
'tr_class': tr_class
})
return movies, distributor_count
# Parse a performer page and extract the data we need
def parse_page_performer(soup):
# Extracted data
data = {}
# Field names we want, mapped to their labels in the HTML
fields = {
'performer_aka': 'Performer AKA',
'birthday': 'Birthday',
'astrology': 'Astrology',
'birthplace': 'Birthplace',
'gender': 'Gender',
'years_active': 'Years Active',
'ethnicity': 'Ethnicity',
'nationality': 'Nationality',
'hair_colors': 'Hair Colors',
'eye_color': 'Eye Color',
'height': 'Height',
'weight': 'Weight',
'measurements': 'Measurements',
'tattoos': 'Tattoos',
'piercings': 'Piercings'
}
reversed_map = {v: k for k, v in fields.items()}
# Parse the credits tables for both performer and directorial roles
role_list = ['personal', 'directoral']
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
credits_list = {}
# Per-distributor counters
distributor_count = {key: 0 for key in distributor_list}
for role in role_list:
table = soup.find('table', id=role)
if table :
movies, stat_map = parse_credits_table(table, distributor_list)
credits_list[role] = movies
# 更新 distributor 统计
for distributor in distributor_list:
distributor_count[distributor] += stat_map.get(distributor, 0)
# Count the movies
#movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
# Nothing found
if len(credits_list) == 0 :
logging.warning("movie table empty.")
# Walk each bioheading to collect the metadata
bioheadings = soup.find_all('p', class_='bioheading')
for bio in bioheadings:
heading = bio.text.strip()
biodata = None
# Headings containing "Performer" need special handling
if 'Performer' in heading:
heading = 'Performer AKA'
biodata_div = bio.find_next('div', class_='biodata')
if biodata_div:
div_text = biodata_div.get_text(separator='|').strip()
biodata = [b.strip() for b in div_text.split('|') if b.strip()]
else:
biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
# 保存数据
if heading in reversed_map:
kkey = reversed_map[heading]
data[kkey] = biodata
# 添加统计数据到 data
data['movies_cnt'] = movies_cnt
data['vixen_cnt'] = distributor_count['vixen']
data['blacked_cnt'] = distributor_count['blacked']
data['tushy_cnt'] = distributor_count['tushy']
data['x_art_cnt'] = distributor_count['x-art']
data['credits'] = credits_list
return data
# Parse a movie page and extract its details
def parse_page_movie(soup, href, title):
# Basic movie info
movie_data = {}
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
if info_div:
labels = info_div.find_all("p", class_="bioheading")
values = info_div.find_all("p", class_="biodata")
for label, value in zip(labels, values):
key = label.text.strip()
val = value.text.strip()
if key in ["Distributor", "Studio", "Director"]:
link = value.find("a")
if link:
val = link.text.strip()
movie_data[f'{key}Href'] = host_url + link['href']
movie_data[key] = val
else:
return None
# Parse the cast info
performers = []
cast_divs = soup.find_all("div", class_="castbox")
for cast in cast_divs:
performer = {}
link = cast.find("a")
if link:
performer["name"] = link.text.strip()
performer["href"] = host_url + link["href"]
performer["tags"] = [
tag.strip() for br in cast.find_all("br")
if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
]
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
performers.append(performer)
# Parse the scene breakdowns
scene_breakdowns = []
scene_table = soup.find("div", id="sceneinfo")
if scene_table:
rows = scene_table.find_all("tr")
for row in rows:
cols = row.find_all("td")
if len(cols) >= 2:
scene = cols[0].text.strip()  # scene number
performer_info = cols[1]  # performer names and link info
# Take the full HTML before the first <br> (keeps formatting such as <i> tags)
performer_html = str(performer_info)  # the cell's full HTML
split_html = performer_html.split("<br/>")  # split on <br/>
if split_html:
performers_html = split_html[0].strip()  # part before the first <br/>
else:
split_html = performer_html.split("<br>")  # fall back to splitting on <br>
if split_html:
performers_html = split_html[0].strip()  # part before the first <br>
else:
performers_html = performer_html.strip()  # no <br> at all: keep everything
# Convert to plain text (strip the HTML tags, keep only the text content)
performers_soup = BeautifulSoup(performers_html, "html.parser")
performers_text = performers_soup.get_text()
# 提取表演者
scene_performers = [p.strip() for p in performers_text.split(",")]
# 尝试获取 `webscene` 和 `studio`
links_data = {}
links = performer_info.find_all("a")
if links:
webscene_title = links[0].text.strip() if len(links)>0 else None
webscene = links[0]["href"] if len(links)>0 else None
studio = links[1].text.strip() if len(links)>1 else None
studio_lnk = links[1]["href"] if len(links)>1 else None
links_data = {
"title": webscene_title,
"webscene": webscene,
"studio": studio,
"studio_lnk": studio_lnk,
}
scene_data = {
"scene": scene,
"performers": scene_performers,
**links_data,
}
scene_breakdowns.append(scene_data)
appears_in = []
appears_divs = soup.find("div", id="appearssection")
if appears_divs:
rows = appears_divs.find_all("li")
for row in rows:
lnk = row.find("a")
if lnk:
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
return {
"href": href,
"title": title,
"Minutes": movie_data.get("Minutes", ""),
"Distributor": movie_data.get("Distributor", ""),
"Studio": movie_data.get("Studio", ""),
"ReleaseDate": movie_data.get("Release Date", ""),
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
"All-Girl": movie_data.get("All-Girl", ""),
"All-Male": movie_data.get("All-Male", ""),
"Compilation": movie_data.get("Compilation", ""),
"Webscene": movie_data.get("Webscene", ""),
"Director": movie_data.get("Director", ""),
"DirectorHref": movie_data.get("DirectorHref", ""),
"DistributorHref": movie_data.get("DistributorHref", ""),
"StudioHref": movie_data.get("StudioHref", ""),
"Performers": performers,
"SceneBreakdowns": scene_breakdowns,
"AppearsIn": appears_in,
}
if __name__ == "__main__":
for astro in astro_list:
url = astr_base_url + astro
next_url = url
logging.info(f"Fetching data for {astro}, url {url} ...")
while True:
soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
if soup:
list_data, next_url = parse_page_astro(soup, astro)
if list_data:
print(list_data[0] if len(list_data)>0 else 'no data')
break
else:
logging.info(f"Retrying {next_url} ...")
time.sleep(5)  # wait before retrying
time.sleep(2)  # throttle the request rate
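
A minimal sketch of the fetch/validate/parse pattern that fetch.py builds on top of this module (requires network access; the sign name is just an example):

from functools import partial
import iafd_scraper as scraper

soup, status = scraper.fetch_page(
    scraper.astr_base_url + "Aries",
    partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"),
)
if soup:
    rows, next_url = scraper.parse_page_astro(soup, "Aries")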

iafd/src/load.py Normal file (+107 lines)

@@ -0,0 +1,107 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import iafd_scraper as scraper
import utils
config.setup_logging()
res_dir = '/root/hostdir/scripts_data/iafd_202503'
# Performer list
def load_performer_list(file, **from_fields):
json_data = utils.read_json(file)
if json_data is None:
json_data = []
total_rows = len(json_data)
loaded_rows = 0
succ = 0
for row in json_data:
row_id = db_tools.insert_performer_index(name=row.get('person', ''),
href=row.get('href', ''),
**from_fields
)
if row_id:
logging.debug(f'insert one person, id: {row_id}, person: {row['person']}, url: {row['href']}')
succ += 1
else:
logging.warning(f'insert person failed. {row['person']}, {row['href']} failed.')
loaded_rows += 1
if loaded_rows % 10000 == 0:
logging.info(f'loading file: {file}, total rows: {total_rows}, loaded rows: {loaded_rows}, succ rows: {succ}')
logging.info(f'load data succ. file: {file}, rows: {total_rows}, succ rows: {succ}')
# Movie list
def load_movie_list(file, **from_fields):
json_data = utils.read_json(file)
if json_data is None:
json_data = []
total_rows = len(json_data)
loaded_rows = 0
succ = 0
for row in json_data:
row_id = db_tools.insert_movie_index(title=row.get('title', ''),
href=row.get('href', ''),
release_year=utils.to_number(row['year']),
**from_fields
)
if row_id:
logging.debug(f'insert one movie, id: {row_id}, title: {row['title']}, url: {row['href']}')
succ += 1
else:
logging.warning(f'insert movie failed: {row['title']}, {row['href']} failed.')
loaded_rows += 1
if loaded_rows % 10000 == 0:
logging.info(f'loading file: {file}, total rows: {total_rows}, loaded rows: {loaded_rows}, succ rows: {succ}')
logging.info(f'load data succ. file: {file}, rows: {len(json_data)}, succ rows: {succ}')
# Performer details
def load_performers(file):
json_data = utils.read_json(file)
if json_data is None:
json_data = []
total_rows = len(json_data)
loaded_rows = 0
succ = 0
for row in json_data:
performer_id = db_tools.insert_or_update_performer(row)
if performer_id:
logging.debug(f'insert one person, id: {performer_id}, person: {row['person']}, url: {row['href']}')
succ += 1
else:
logging.warning(f'insert person failed. {row['person']}, {row['href']} failed.')
loaded_rows += 1
if loaded_rows % 10000 == 0:
logging.info(f'loading file: {file}, total rows: {total_rows}, loaded rows: {loaded_rows}, succ rows: {succ}')
logging.info(f'load data succ. file: {file}, rows: {len(json_data)}, succ rows: {succ}')
if __name__ == "__main__":
load_performer_list(f'{res_dir}/astro.json', from_astro_list=1)
time.sleep(3)
load_performer_list(f'{res_dir}/birth.json', from_birth_list=1)
time.sleep(3)
load_performer_list(f'{res_dir}/ethnic.json', from_ethnic_list=1)
time.sleep(3)
load_movie_list(f'{res_dir}/distributors.json', from_dist_list=1)
time.sleep(3)
load_movie_list(f'{res_dir}/studios.json', from_stu_list=1)
time.sleep(3)
load_performers(f'{res_dir}/performers.json')
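
A minimal sketch of the list-file entries that load_performer_list() expects (field names taken from the code; the href value is hypothetical):

sample_astro_entry = {"person": "Example Performer", "href": "https://www.iafd.com/person.rme/perfid=example"}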

iafd/src/sqlite_utils.py Normal file (+848 lines)

@@ -0,0 +1,848 @@
import sqlite3
import json
import config
import utils
import logging
import sys
from datetime import datetime
# Connect to the SQLite database
DB_PATH = f"{config.global_share_data_dir}/shared.db"  # replace with your database file
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# 获取当前时间
def get_current_time():
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Look up an id by href in the given table
def get_id_by_href(table: str, href: str) -> int:
if href is None:
return None
cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
row = cursor.fetchone()
return row[0] if row else None
# Insert a performer index row (from list data)
def insert_performer_index(name, href, from_astro_list=None, from_birth_list=None, from_ethnic_list=None, from_movie_list=None):
try:
# Check whether this performer already exists
cursor.execute("""
SELECT id, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list
FROM iafd_performers WHERE href = ?
""", (href,))
existing_performer = cursor.fetchone()
if existing_performer:  # performer already exists
performer_id, existing_name, existing_astro, existing_birth, existing_ethnic, existing_movie = existing_performer
# Keep the existing values for any flags that were not passed in
from_astro_list = from_astro_list if from_astro_list is not None else existing_astro
from_birth_list = from_birth_list if from_birth_list is not None else existing_birth
from_ethnic_list = from_ethnic_list if from_ethnic_list is not None else existing_ethnic
from_movie_list = from_movie_list if from_movie_list is not None else existing_movie
cursor.execute("""
UPDATE iafd_performers
SET name = ?,
from_astro_list = ?,
from_birth_list = ?,
from_ethnic_list = ?,
from_movie_list = ?,
updated_at = datetime('now', 'localtime')
WHERE href = ?
""", (name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list, href))
else:  # performer does not exist yet: insert
cursor.execute("""
INSERT INTO iafd_performers (href, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list)
VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
""", (href, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list))
conn.commit()
performer_id = get_id_by_href('iafd_performers', href)
if performer_id:
logging.debug(f'Inserted/Updated performer index, id: {performer_id}, name: {name}, href: {href}')
return performer_id
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
except Exception as e:
conn.rollback()
logging.error(f"Unexpected error: {e}")
return None
# Insert a movie index row (from list data)
def insert_movie_index(title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
try:
# Check whether this movie already exists
cursor.execute("""
SELECT id, title, release_year, from_performer_list, from_dist_list, from_stu_list
FROM iafd_movies WHERE href = ?
""", (href,))
existing_movie = cursor.fetchone()
if existing_movie:  # movie already exists
movie_id, existing_title, existing_year, existing_performer, existing_dist, existing_stu = existing_movie
# Keep the existing values for anything that was not passed in
release_year = release_year if release_year != 0 else existing_year
from_performer_list = from_performer_list if from_performer_list is not None else existing_performer
from_dist_list = from_dist_list if from_dist_list is not None else existing_dist
from_stu_list = from_stu_list if from_stu_list is not None else existing_stu
cursor.execute("""
UPDATE iafd_movies
SET title = ?,
release_year = ?,
from_performer_list = ?,
from_dist_list = ?,
from_stu_list = ?,
updated_at = datetime('now', 'localtime')
WHERE href = ?
""", (title, release_year, from_performer_list, from_dist_list, from_stu_list, href))
else:  # movie does not exist yet: insert
cursor.execute("""
INSERT INTO iafd_movies (title, href, release_year, from_performer_list, from_dist_list, from_stu_list)
VALUES (?, ?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
""", (title, href, release_year, from_performer_list, from_dist_list, from_stu_list))
conn.commit()
movie_id = get_id_by_href('iafd_movies', href)
if movie_id:
logging.debug(f'Inserted/Updated movie index, id: {movie_id}, title: {title}, href: {href}')
return movie_id
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
except Exception as e:
conn.rollback()
logging.error(f"Unexpected error: {e}")
return None
# Insert a performer-movie association
def insert_performer_movie(performer_id, movie_id, role, notes):
try:
cursor.execute("""
INSERT INTO iafd_performers_movies (performer_id, movie_id, role, notes)
VALUES (?, ?, ?, ?)
ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes, role=excluded.role
""",
(performer_id, movie_id, role, notes)
)
conn.commit()
#logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}')
return performer_id
except Exception as e:
conn.rollback()
logging.error("Error inserting movie: %s", e)
return None
# Insert a movie-to-movie (appears-in) association
def insert_movie_appears_in(movie_id, appears_in_id, gradation=0, notes=''):
try:
cursor.execute("""
INSERT INTO iafd_movies_appers_in (movie_id, appears_in_id, gradation, notes)
VALUES (?, ?, ?, ?)
ON CONFLICT(movie_id, appears_in_id) DO UPDATE SET notes=excluded.notes, gradation=excluded.gradation
""",
(movie_id, appears_in_id, gradation, notes)
)
conn.commit()
#logging.debug(f'insert one movie_appears_in, movie_id: {movie_id}, appears_in_id: {appears_in_id}')
return movie_id
except Exception as e:
conn.rollback()
logging.error("Error inserting movie: %s", e)
return None
# Insert or update full performer details
def insert_or_update_performer(data):
try:
cursor.execute("""
INSERT INTO iafd_performers (href, name, gender, birthday, astrology, birthplace, years_active, ethnicity, nationality, hair_colors,
eye_color, height_str, weight_str, measurements, tattoos, piercings, weight, height, movies_cnt, vixen_cnt,
blacked_cnt, tushy_cnt, x_art_cnt, is_full_data, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
gender = excluded.gender,
birthday = excluded.birthday,
astrology = excluded.astrology,
birthplace = excluded.birthplace,
years_active = excluded.years_active,
ethnicity = excluded.ethnicity,
nationality = excluded.nationality,
hair_colors = excluded.hair_colors,
eye_color = excluded.eye_color,
height_str = excluded.height_str,
weight_str = excluded.weight_str,
measurements = excluded.measurements,
tattoos = excluded.tattoos,
piercings = excluded.piercings,
weight = excluded.weight,
height = excluded.height,
movies_cnt = excluded.movies_cnt,
vixen_cnt = excluded.vixen_cnt,
blacked_cnt = excluded.blacked_cnt,
tushy_cnt = excluded.tushy_cnt,
x_art_cnt = excluded.x_art_cnt,
is_full_data = 1,
updated_at = datetime('now', 'localtime')
""", (
data["href"], data["person"], data.get("gender"), data.get("birthday"), data.get("astrology"), data.get("birthplace"), data.get("years_active"),
data.get("ethnicity"), data.get("nationality"), data.get("hair_colors"), data.get("eye_color"), data.get("height"),
data.get("weight"), data.get("measurements"), data.get("tattoos"), data.get("piercings"), utils.parse_weight(data.get('weight')), utils.parse_height(data.get('height')),
data.get("movies_cnt", 0), data.get("vixen_cnt", 0), data.get("blacked_cnt", 0), data.get("tushy_cnt", 0), data.get("x_art_cnt", 0)
))
# 获取 performer_id
performer_id = get_id_by_href('iafd_performers', data["href"])
if performer_id is None:
return None
logging.debug(f'insert one performer, id: {performer_id}, name: {data['person']}, href: {data['href']}')
# 插入新的 alias
for alias in data.get("performer_aka") or []:
if alias.lower() != "no known aliases":
cursor.execute("INSERT OR IGNORE INTO iafd_performer_aliases (performer_id, alias) VALUES (?, ?) ", (performer_id, alias))
conn.commit()
# Insert the credits; a performer may appear under both the 'personal' and 'directoral' roles
credits = data.get('credits', {})
for role, movies in credits.items():
if movies:
for movie in movies:
movie_id = get_id_by_href('iafd_movies', movie['href'])
# 影片不存在,先插入
if movie_id is None:
movie_id = insert_movie_index(movie['title'], movie['href'], utils.to_number(movie['year']), from_performer_list=1)
if movie_id:
tmp_id = insert_performer_movie(performer_id, movie_id, role, movie['notes'])
if tmp_id :
logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}, role: {role}')
else:
logging.warning(f'insert performer_movie failed. performer_id: {performer_id}, moive href: {movie['href']}')
return performer_id
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
except Exception as e:
conn.rollback()
logging.error(f"Unexpected error: {e}")
return None
# Insert or update a performer for an abnormal URL (e.g. a 404 link)
def insert_or_update_performer_404(name, href):
try:
cursor.execute("""
INSERT INTO iafd_performers (href, name, is_full_data, updated_at)
VALUES (?, ?, 1, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
is_full_data = 1,
updated_at = datetime('now', 'localtime')
""", (
href, name
))
# 获取 performer_id
performer_id = get_id_by_href('iafd_performers', href)
if performer_id is None:
return None
logging.debug(f'insert one performer, id: {performer_id}, name: {name}, href: {href}')
return performer_id
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
except Exception as e:
conn.rollback()
logging.error(f"Unexpected error: {e}")
return None
# Delete a performer by id or href
def delete_performer(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM iafd_performers WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM iafd_performers WHERE href = ?", (identifier,))
else:
logging.warning("invalid delete parameter")
return
conn.commit()
logging.info(f"Deleted performer: {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Delete failed: {e}")
# Query performer details by id, href, or name
def query_performer(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM iafd_performers WHERE id = ?", (identifier,))
elif "http" in identifier:
cursor.execute("SELECT * FROM iafd_performers WHERE href = ?", (identifier,))
else:
cursor.execute("SELECT * FROM iafd_performers WHERE name LIKE ?", (f"%{identifier}%",))
performer = cursor.fetchone()
if performer:
cursor.execute("SELECT alias FROM iafd_performer_aliases WHERE performer_id = ?", (performer[0],))
aliases = [row[0] for row in cursor.fetchall()]
result = dict(zip([desc[0] for desc in cursor.description], performer))
result["performer_aka"] = aliases
return result
else:
logging.warning(f"Performer not found: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return None
# Query a list of hrefs with optional filters
def query_performer_hrefs(**filters):
try:
sql = "SELECT href, name FROM iafd_performers WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
if "is_full_data" in filters:
sql += " AND is_full_data = ?"
params.append(filters["is_full_data"])
if 'limit' in filters:
sql += " limit ?"
params.append(filters["limit"])
cursor.execute(sql, params)
#return [row[0].lower() for row in cursor.fetchall()]  # lowercase variant
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"href query failed: {e}")
return None
# Insert or update an ethnicity entry
def insert_or_update_ethnic(data):
try:
cursor.execute("""
INSERT INTO iafd_meta_ethnic (name, href)
VALUES (?, ?)
ON CONFLICT(href) DO UPDATE SET
name = excluded.name
""", (data["name"], data["href"]))
conn.commit()
# 获取 performer_id
cursor.execute("SELECT id FROM iafd_meta_ethnic WHERE href = ?", (data["href"],))
dist_id = cursor.fetchone()[0]
if dist_id:
logging.debug(f"Inserted/updated ethnicity: {data['name']}")
return dist_id
else:
return None
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
# Query a list of hrefs with optional filters
def query_ethnic_hrefs(**filters):
try:
sql = "SELECT href, name FROM iafd_meta_ethnic WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "url" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
#return [row[0].lower() for row in cursor.fetchall()]  # lowercase variant
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"href query failed: {e}")
return None
# Insert or update a distributor
def insert_or_update_distributor(data):
try:
cursor.execute("""
INSERT INTO iafd_distributors (name, href, updated_at)
VALUES (?, ? , datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
updated_at = datetime('now', 'localtime')
""", (data["name"], data["href"]))
conn.commit()
# Get the distributor id
cursor.execute("SELECT id FROM iafd_distributors WHERE href = ?", (data["href"],))
dist_id = cursor.fetchone()[0]
if dist_id:
logging.debug(f"Inserted/updated distributor: {data['name']}")
return dist_id
else:
return None
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
# Delete a distributor (by id or name)
def delete_distributor(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM iafd_distributors WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM iafd_distributors WHERE name = ?", (identifier,))
conn.commit()
logging.info(f"Deleted distributor: {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Delete failed: {e}")
# Query a distributor (by id or name)
def query_distributor(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM iafd_distributors WHERE id = ?", (identifier,))
else:
cursor.execute("SELECT * FROM iafd_distributors WHERE name LIKE ?", (f"%{identifier}%",))
distributor = cursor.fetchone()
if distributor:
return dict(zip([desc[0] for desc in cursor.description], distributor))
else:
logging.warning(f"Distributor not found: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return None
# Query a list of hrefs with optional filters
def query_distributor_hrefs(**filters):
try:
sql = "SELECT href FROM iafd_distributors WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "url" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
return [row[0].lower() for row in cursor.fetchall()]  # hrefs are stored lowercase
except sqlite3.Error as e:
logging.error(f"href query failed: {e}")
return None
# Insert or update a studio
def insert_or_update_studio(data):
try:
cursor.execute("""
INSERT INTO iafd_studios (name, href, updated_at)
VALUES (?, ?, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
updated_at = datetime('now', 'localtime')
""", (data["name"], data["href"]))
conn.commit()
        # Fetch the studio id that was just inserted/updated
        cursor.execute("SELECT id FROM iafd_studios WHERE href = ?", (data["href"],))
        stu_id = cursor.fetchone()[0]
        if stu_id:
            logging.debug(f"Inserted/updated studio: {data['name']}")
            return stu_id
        else:
            return None
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Database error: {e}")
return None
# Delete a studio (by id or name)
def delete_studio(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM iafd_studios WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM iafd_studios WHERE name = ?", (identifier,))
conn.commit()
        logging.info(f"Deleted studio: {identifier}")
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Delete failed: {e}")
# Query a studio (by id or name)
def query_studio(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM iafd_studios WHERE id = ?", (identifier,))
else:
cursor.execute("SELECT * FROM iafd_studios WHERE name LIKE ?", (f"%{identifier}%",))
studio = cursor.fetchone()
if studio:
return dict(zip([desc[0] for desc in cursor.description], studio))
else:
            logging.warning(f"Studio not found: {identifier}")
            return None
    except sqlite3.Error as e:
        logging.error(f"Query failed: {e}")
return None
# Query href list by filter conditions
def query_studio_hrefs(**filters):
try:
sql = "SELECT href FROM iafd_studios WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
        return [row[0].lower() for row in cursor.fetchall()]  # hrefs are handled in lowercase
    except sqlite3.Error as e:
        logging.error(f"Failed to query hrefs: {e}")
return None
# Insert or update a movie record
def insert_or_update_movie(movie_data):
try:
        # Look up the related ids
        distributor_id = get_id_by_href('iafd_distributors', movie_data['DistributorHref'])
        studio_id = get_id_by_href('iafd_studios', movie_data['StudioHref'])
        director_id = get_id_by_href('iafd_performers', movie_data['DirectorHref'])
        # If the director does not exist yet, insert an index record for them
        if director_id is None:
            director_id = insert_performer_index(movie_data['Director'], movie_data['DirectorHref'], from_movie_list=1)
        if studio_id is None:
            studio_id = 0
        if distributor_id is None:
            distributor_id = 0
        # Insert or update the movie row
cursor.execute(
"""
INSERT INTO iafd_movies (title, minutes, distributor_id, studio_id, release_date, added_to_IAFD_date,
all_girl, all_male, compilation, webscene, director_id, href, is_full_data, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
title=excluded.title, minutes=excluded.minutes, distributor_id=excluded.distributor_id,
studio_id=excluded.studio_id, release_date=excluded.release_date,
added_to_IAFD_date=excluded.added_to_IAFD_date, all_girl=excluded.all_girl,
all_male=excluded.all_male, compilation=excluded.compilation, webscene=excluded.webscene,
director_id=excluded.director_id, is_full_data=1, updated_at = datetime('now', 'localtime')
""",
(movie_data['title'], movie_data['Minutes'], distributor_id, studio_id, movie_data['ReleaseDate'],
movie_data['AddedtoIAFDDate'], movie_data['All-Girl'], movie_data['All-Male'],
movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
)
conn.commit()
        # Fetch the id of the movie that was just inserted/updated
        movie_id = get_id_by_href('iafd_movies', movie_data['href'])
        if movie_id is None:
            return None
        logging.debug(f"insert one movie, id: {movie_id}, title: {movie_data['title']}, href: {movie_data['href']}")
        # Insert rows into the performers_movies relation table
        for performer in movie_data.get('Performers', []):
            performer_id = get_id_by_href('iafd_performers', performer['href'])
            # If the performer does not exist yet, insert an index record first
            if performer_id is None:
                performer_id = insert_performer_index(performer['name'], performer['href'], from_movie_list=1)
            if performer_id:
                notes = '|'.join(tag for tag in performer['tags'] if tag != performer['name'])
                tmp_id = insert_performer_movie(performer_id, movie_id, 'personal', notes)
                if tmp_id:
                    logging.debug(f"insert one performer_movie. performer_id: {performer_id}, movie_id: {movie_id}")
                else:
                    logging.debug(f"insert performer_movie failed. performer_id: {performer_id}, movie_id: {movie_id}")
            else:
                logging.warning(f"insert performer failed. name: {performer['name']}, href: {performer['href']}")
        # Insert rows into the movies_appears_in table
        for appears in movie_data.get("AppearsIn", []):
            appears_in_id = get_id_by_href('iafd_movies', appears['href'])
            # If the referenced movie does not exist yet, insert an index record first
            if appears_in_id is None:
                appears_in_id = insert_movie_index(appears['title'], appears['href'])
            if appears_in_id:
                tmp_id = insert_movie_appears_in(movie_id, appears_in_id)
                if tmp_id:
                    logging.debug(f'insert one movie_appears_in record. movie_id: {movie_id}, appears_in_id: {appears_in_id}')
                else:
                    logging.warning(f'insert movie_appears_in failed. movie_id: {movie_id}, appears_in_id: {appears_in_id}')
            else:
                logging.warning(f"get appears_in_id failed. title: {appears['title']}, href: {appears['href']}")
return movie_id
except Exception as e:
conn.rollback()
logging.error("Error inserting movie: %s", e)
return None
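# Illustrative sketch (not part of the original module): the minimal shape of the movie_data
# dict expected by insert_or_update_movie, with keys taken from the function above. All of the
# concrete values below are placeholders, not real records.
def _insert_or_update_movie_example():
    movie_data = {
        'title': 'Example Title',
        'Minutes': '90',
        'DistributorHref': 'https://www.iafd.com/distrib.rme/distrib=0',
        'StudioHref': 'https://www.iafd.com/studio.rme/studio=0',
        'Director': 'Example Director',
        'DirectorHref': 'https://www.iafd.com/person.rme/id=00000000-0000-0000-0000-000000000000',
        'ReleaseDate': '2020-01-01',
        'AddedtoIAFDDate': '2020-01-02',
        'All-Girl': 'No',
        'All-Male': 'No',
        'Compilation': 'No',
        'Webscene': 'No',
        'href': 'https://www.iafd.com/title.rme/id=example',
        'Performers': [],   # list of {'name': ..., 'href': ..., 'tags': [...]} dicts
        'AppearsIn': [],    # list of {'title': ..., 'href': ...} dicts
    }
    return insert_or_update_movie(movie_data)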
# Insert or update a movie record for abnormal URLs (e.g. 404 links)
def insert_or_update_movie_404(title, href):
try:
        # Insert or update the movie row
cursor.execute(
"""
INSERT INTO iafd_movies (title, href, is_full_data, updated_at)
VALUES (?, ?, 1, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
title=excluded.title, is_full_data=1, updated_at = datetime('now', 'localtime')
""",
(title, href)
)
conn.commit()
        # Fetch the id of the movie that was just inserted/updated
movie_id = get_id_by_href('iafd_movies', href)
if movie_id is None:
return None
return movie_id
except Exception as e:
conn.rollback()
logging.error("Error inserting movie: %s", e)
return None
# Delete a movie record (by id or href)
def delete_movie(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM iafd_movies WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM iafd_movies WHERE href = ?", (identifier,))
else:
            logging.warning("Invalid delete argument")
return
conn.commit()
logging.info(f"Deleted movie with {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error("Error deleting movie: %s", e)
# Query a movie record (by id, href, or title)
def query_movies(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM iafd_movies WHERE id = ?", (identifier,))
elif "http" in identifier:
cursor.execute("SELECT * FROM iafd_movies WHERE href = ?", (identifier,))
else:
cursor.execute("SELECT * FROM iafd_movies WHERE title LIKE ?", (f"%{identifier}%",))
movie = cursor.fetchone()
if movie:
            result = dict(zip([desc[0] for desc in cursor.description], movie))
            cursor.execute("SELECT performer_id FROM iafd_performers_movies WHERE movie_id = ?", (movie[0],))
            result["performers"] = [row[0] for row in cursor.fetchall()]
            return result
else:
logging.warning(f"find no data: {identifier}")
return None
except sqlite3.Error as e:
        logging.error(f"Query failed: {e}")
return None
# Query href list by filter conditions
def query_movie_hrefs(**filters):
try:
sql = "SELECT href, title FROM iafd_movies WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "title" in filters:
sql += " AND title LIKE ?"
params.append(f"%{filters['title']}%")
if "is_full_data" in filters:
sql += " AND is_full_data = ?"
params.append(filters["is_full_data"])
if 'limit' in filters:
sql += " limit ?"
params.append(filters["limit"])
cursor.execute(sql, params)
        #return [row[0].lower() for row in cursor.fetchall()]  # hrefs are handled in lowercase
        return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
    except sqlite3.Error as e:
        logging.error(f"Failed to query hrefs: {e}")
return []
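# Illustrative usage sketch (not part of the original module): fetch up to 50 movies whose
# details have not been filled in yet (is_full_data = 0); the limit of 50 is only an example.
def _query_movie_hrefs_example():
    return query_movie_hrefs(is_full_data=0, limit=50)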
# Fetch performers from view_iafd_performers_movies whose stored movie count does not match the actual count
def get_performers_needed_update(limit=None):
try:
sql = """
SELECT href, name FROM view_iafd_performers_movies where actual_movies_cnt != movies_cnt
"""
if limit is not None:
sql += f" LIMIT {limit}"
cursor.execute(sql)
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
except sqlite3.Error as e:
        logging.error(f"Failed to query hrefs: {e}")
return []
# Insert a task log entry
def insert_task_log():
try:
cursor.execute("""
INSERT INTO iafd_task_log (task_status) VALUES ('Start')
""")
conn.commit()
task_id = cursor.lastrowid
if task_id is None:
return None
update_task_log(task_id=task_id, task_status='Start')
        return task_id  # id of the inserted task row
    except sqlite3.Error as e:
        logging.error(f"Failed to insert task log: {e}")
return None
# Update arbitrary fields of a task log entry (low-level helper)
def update_task_log_inner(task_id, **kwargs):
try:
fields = ", ".join(f"{key} = ?" for key in kwargs.keys())
params = list(kwargs.values()) + [task_id]
sql = f"UPDATE iafd_task_log SET {fields}, updated_at = datetime('now', 'localtime') WHERE task_id = ?"
cursor.execute(sql, params)
conn.commit()
except sqlite3.Error as e:
        logging.error(f"Failed to update task {task_id}: {e}")
# Refresh row-count statistics and status for a task log entry
def update_task_log(task_id, task_status):
try:
        # Get the current row counts of the performers, movies, distributors and studios tables
cursor.execute("SELECT COUNT(*) FROM iafd_performers where is_full_data=1")
full_data_performers = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM iafd_performers")
total_performers = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM iafd_movies where is_full_data=1")
full_data_movies = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM iafd_movies")
total_movies = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM iafd_distributors")
total_distributors = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM iafd_studios")
total_studios = cursor.fetchone()[0]
        # Update the task_log row
update_task_log_inner(task_id,
full_data_performers=full_data_performers,
total_performers=total_performers,
full_data_movies=full_data_movies,
total_movies=total_movies,
total_distributors=total_distributors,
total_studios=total_studios,
task_status=task_status)
except sqlite3.Error as e:
        logging.error(f"Failed to update task {task_id}: {e}")
# Mark a task as finished
def finalize_task_log(task_id):
try:
        # Final update of the task_log row
        update_task_log(task_id, task_status="Success")
    except sqlite3.Error as e:
        logging.error(f"Failed to finalize task {task_id}: {e}")
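# Illustrative sketch (not part of the original module): the task-log lifecycle is
# insert_task_log() at start, update_task_log() while running, finalize_task_log() at the end.
# The intermediate status string 'Running' below is only an example value.
def _task_log_lifecycle_example():
    task_id = insert_task_log()            # creates a row with status 'Start'
    if task_id is None:
        return
    update_task_log(task_id, 'Running')    # refreshes the row-count statistics with a status
    finalize_task_log(task_id)             # marks the task as 'Success'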
if __name__ == "__main__":
try:
with open('../result/detail.json', 'r') as file:
performers = json.load(file)
for performer in performers:
insert_or_update_performer(performer)
print(query_performer("Kirsten"))
#delete_performer("https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34")
print(query_performer_hrefs())
except FileNotFoundError:
logging.info("detail.json not found, starting fresh.")

101
iafd/src/utils.py Normal file
View File

@ -0,0 +1,101 @@
import re
import os
import json
import time
import csv
import logging
import config
# Parse height and weight strings into numbers
def parse_height(height_str):
    # Parsing is currently short-circuited; the function always returns 0
    return 0
    try:
        return int(height_str.split("(")[-1].replace(" cm)", ""))
    except:
        return None
def parse_weight(weight_str):
    # Parsing is currently short-circuited; the function always returns 0
    return 0
    try:
        return int(weight_str.split(" ")[0])
    except:
        return None
update_dir = f'{config.global_host_data_dir}/iafd'
performers_dir = f'{update_dir}/performers'
movies_dir = f'{update_dir}/movies'
def to_number(value):
    """Convert a string to a number; return 0 if invalid."""
try:
return float(value)
except (ValueError, TypeError):
return 0
def dist_stu_href_rewrite(href):
    # Extract the numeric ID (works for both distrib and studio URLs) and rebuild a canonical URL
    match = re.search(r"(distrib|studio)=(\d+)", href)
    if not match:
        return None  # not a target URL
    key, id_number = match.groups()
    new_url = f"https://www.iafd.com/{key}.rme/{key}={id_number}"
    return new_url
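# Illustrative sketch (not part of the original module): how the rewrite behaves. The id 6496
# is only a sample value taken from the distributor maps used elsewhere in this repo.
def _dist_stu_href_rewrite_example():
    # A query-string style link is normalized to the canonical .rme form:
    # https://www.iafd.com/distrib.asp?distrib=6496 -> https://www.iafd.com/distrib.rme/distrib=6496
    return dist_stu_href_rewrite("https://www.iafd.com/distrib.asp?distrib=6496")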
# Create a sub-directory named after the first character of the given string
def create_sub_directory(base_dir, name):
    # Use the lowercased first character as the sub-directory name
    sub_dir = name[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    return full_path
# Extract the id value from a URL such as https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from an href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
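# Illustrative sketch (not part of the original module): extracting the id from the sample URL
# mentioned in the comment above.
def _extract_id_example():
    href = "https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586"
    return extract_id_from_href(href)  # -> "21898a3c-1ddd-4793-8d93-375d6db20586"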
# Write an individual JSON file for each performer
def write_person_json(person, href, data):
    # Build the target directory and file name
    person_dir = create_sub_directory(performers_dir, person)
    person_id = extract_id_from_href(href)
    person_filename = f"{person.replace(' ', '-')}({person_id}).json"  # spaces replaced with '-'
full_path = os.path.join(person_dir, person_filename)
try:
with open(full_path, 'w', encoding='utf-8') as json_file:
json.dump(data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {full_path}: {e}")
# Write an individual JSON file for each movie
def write_movie_json(href, data):
    # Build the target directory and file name from the movie id
    movie_id = extract_id_from_href(href)
    movie_dir = create_sub_directory(movies_dir, movie_id)
    movie_filename = f"{movie_id}.json"
    full_path = os.path.join(movie_dir, movie_filename)
try:
with open(full_path, 'w', encoding='utf-8') as json_file:
json.dump(data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {full_path}: {e}")
# Read a JSON file and return its contents
def read_json(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return None
    except json.JSONDecodeError:
        print(f"Failed to parse file {file_path}.")
        return None

26
iafd/src_json/config.py Normal file
View File

@ -0,0 +1,26 @@
import logging
import os
import inspect
from datetime import datetime
global_share_data_dir = '/root/sharedata'
global_host_data_dir = '/root/hostdir/scripts_data'
# Configure logging
def setup_logging(log_filename=None):
    # If no log_filename is given, derive one from the calling script's name
    if log_filename is None:
        # Get the file name of the script that called setup_logging
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
        # Current date in yyyymmdd format
        current_date = datetime.now().strftime('%Y%m%d')
        # Build the log file name with the date appended before the extension
        log_filename = f'../log/{caller_filename}_{current_date}.log'
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
handlers=[
logging.FileHandler(log_filename),
logging.StreamHandler()
])
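# Illustrative usage sketch (not part of the original module): callers import config and call
# setup_logging() once at startup; the explicit file name below is only an example.
def _setup_logging_example():
    # Default: derive the log file name from the calling script, e.g. ../log/<script>_<yyyymmdd>.log
    setup_logging()
    # Alternatively, pass an explicit log file path:
    # setup_logging('../log/example_run.log')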

View File

@ -0,0 +1,334 @@
import os
import json
import csv
import time
import logging
import sys
import signal
import re
import cloudscraper
from bs4 import BeautifulSoup
import config
config.setup_logging()
# Base URL
host_url = "https://www.iafd.com"
# Directories and file paths
RESULT_DIR = "../result"
OUTPUT_DIR = f"{config.global_share_data_dir}/iafd"
INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, "movie_details.json")
OUTPUT_CSV = os.path.join(OUTPUT_DIR, "movie_details.csv")
BATCH_SIZE = 100  # flush results to file every 100 records
movies_dir = f'{RESULT_DIR}/movies'
# Initialize the Cloudflare bypass scraper
scraper = cloudscraper.create_scraper()
# Accumulated data
all_movies = []
def load_existing_data():
    """Load already-processed data so the script can resume where it left off."""
if os.path.exists(OUTPUT_JSON):
with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
try:
return json.load(f)
except json.JSONDecodeError:
return []
return []
def save_data():
    """Save the collected data to the JSON and CSV output files."""
logging.info("Saving data...")
global all_movies
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
json.dump(all_movies, f, indent=4, ensure_ascii=False)
with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
writer = csv.writer(f)
writer.writerow(["href", "title", "Minutes", "Distributor", "Studio", "ReleaseDate",
"AddedtoIAFDDate", "All-Girl", "All-Male", "Compilation", "Webscene", "Director"])
for movie in all_movies:
writer.writerow([movie["href"], movie["title"], movie["Minutes"], movie["Distributor"],
movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"], movie["All-Girl"],
movie["All-Male"], movie["Compilation"], movie["Webscene"], movie["Director"]])
# Fetch a web page and return its HTML content
def fetch_html(href):
    """Fetch a web page and return its HTML content, retrying up to 3 times."""
for attempt in range(3):
try:
response = scraper.get(href, timeout=10)
if response.status_code == 200:
return response.text
except Exception as e:
logging.warning(f"Error fetching {href}: {e}")
time.sleep(2)
logging.error(f"Failed to fetch {href} after 3 attempts")
return None
# Parse the movie page HTML and extract movie information
def parse_movie_details(html, href, title):
    """Parse the page HTML and extract movie information."""
    soup = BeautifulSoup(html, "html.parser")
    # Parse basic movie metadata
    movie_data = {}
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
if info_div:
labels = info_div.find_all("p", class_="bioheading")
values = info_div.find_all("p", class_="biodata")
for label, value in zip(labels, values):
key = label.text.strip()
val = value.text.strip()
if key in ["Distributor", "Studio", "Director"]:
link = value.find("a")
if link:
val = link.text.strip()
movie_data[f'{key}Href'] = host_url + link['href']
movie_data[key] = val
else:
return None
    # Parse cast information
performers = []
cast_divs = soup.find_all("div", class_="castbox")
for cast in cast_divs:
performer = {}
link = cast.find("a")
if link:
performer["name"] = link.text.strip()
performer["href"] = host_url + link["href"]
performer["tags"] = [
tag.strip() for br in cast.find_all("br")
if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
]
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
performers.append(performer)
    # Parse scene breakdowns
scene_breakdowns = []
scene_table = soup.find("div", id="sceneinfo")
if scene_table:
rows = scene_table.find_all("tr")
for row in rows:
cols = row.find_all("td")
if len(cols) >= 2:
                    scene = cols[0].text.strip()  # scene number
                    performer_info = cols[1]  # contains the performers and link info
                    # Take the full HTML before the first <br> (keeping <i> tags and other formatting)
                    performer_html = str(performer_info)  # raw HTML of the cell
                    split_html = performer_html.split("<br/>")  # split on <br/>
                    if split_html:
                        performers_html = split_html[0].strip()  # part before the <br/>
                    else:
                        split_html = performer_html.split("<br>")  # split on <br>
                        if split_html:
                            performers_html = split_html[0].strip()  # part before the <br>
                        else:
                            performers_html = performer_html.strip()  # no <br> at all, take everything
                    # Strip the HTML tags and keep only the text content
                    performers_soup = BeautifulSoup(performers_html, "html.parser")
                    performers_text = performers_soup.get_text()
                    # Extract the performer names
                    scene_performers = [p.strip() for p in performers_text.split(",")]
                    # Try to pick up the webscene and studio links
links_data = {}
links = performer_info.find_all("a")
if links:
webscene_title = links[0].text.strip() if len(links)>0 else None
webscene = links[0]["href"] if len(links)>0 else None
studio = links[1].text.strip() if len(links)>1 else None
studio_lnk = links[1]["href"] if len(links)>1 else None
links_data = {
"title": webscene_title,
"webscene": webscene,
"studio": studio,
"studio_lnk": studio_lnk,
}
scene_data = {
"scene": scene,
"performers": scene_performers,
**links_data,
}
scene_breakdowns.append(scene_data)
appears_in = []
appears_divs = soup.find("div", id="appearssection")
if appears_divs:
rows = appears_divs.find_all("li")
for row in rows:
lnk = row.find("a")
if lnk:
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
return {
"href": href,
"title": title,
"Minutes": movie_data.get("Minutes", ""),
"Distributor": movie_data.get("Distributor", ""),
"Studio": movie_data.get("Studio", ""),
"ReleaseDate": movie_data.get("Release Date", ""),
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
"All-Girl": movie_data.get("All-Girl", ""),
"All-Male": movie_data.get("All-Male", ""),
"Compilation": movie_data.get("Compilation", ""),
"Webscene": movie_data.get("Webscene", ""),
"Director": movie_data.get("Director", ""),
"DirectorHref": movie_data.get("DirectorHref", ""),
"DistributorHref": movie_data.get("DistributorHref", ""),
"StudioHref": movie_data.get("StudioHref", ""),
"Performers": performers,
"SceneBreakdowns": scene_breakdowns,
"AppearsIn": appears_in,
}
# Create a sub-directory named after the first character of the given string
def create_sub_directory(base_dir, name):
    # Use the lowercased first character as the sub-directory name
    sub_dir = name[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    return full_path
# Extract the id value from a URL such as https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from an href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
# Write an individual JSON file for each movie
def write_movie_json(href, data):
    # Build the target directory and file name from the movie id
    movie_id = extract_id_from_href(href)
    movie_dir = create_sub_directory(movies_dir, movie_id)
    movie_filename = f"{movie_id}.json"
    full_path = os.path.join(movie_dir, movie_filename)
try:
with open(full_path, 'w', encoding='utf-8') as json_file:
json.dump(data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {full_path}: {e}")
def process_movies():
    """Process the movie list: fetch and save details for each entry."""
global all_movies
all_movies = load_existing_data()
processed_hrefs = {movie["href"] for movie in all_movies}
    # Read the movie list file (movie_list.json)
with open(INPUT_FILE, "r", encoding="utf-8") as f:
movies = json.load(f)
count = 0
for entry in movies:
href = entry["href"]
title = entry["title"]
        if href in processed_hrefs:
            logging.info(f"Skipping already processed: {title} ({href})")
            continue  # skip entries that were already processed
logging.info(f"Processing: {title} ({href})")
        while True:
            html = fetch_html(href)
            if not html:
                logging.warning(f'Retrying {title} ({href})')
                continue  # fetch failed, retry
            else:
                movie = parse_movie_details(html, href, title)
                if not movie:
                    logging.warning(f'Retrying {title} ({href})')
                    continue
                else:
                    all_movies.append(movie)
                    count += 1
                    # Write the per-movie JSON file
                    write_movie_json(href, movie)
                    break
        # Flush to file every BATCH_SIZE records
if count % BATCH_SIZE == 0:
save_data()
    # Final save of the output files
save_data()
logging.info("Task completed.")
# Extract the id value from a URL such as https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from an href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
# Fetch and process a single specified URL
def process_one(href):
    # Initialize cloudscraper
    scraper = cloudscraper.create_scraper()
    # Fetch and parse the data
    movie = {}
    while True:
        html = fetch_html(href)
        if not html:
            logging.warning(f'fetching {href} error. retrying...')
            continue  # fetch failed, retry
        movie = parse_movie_details(html, href, 'title')
        if movie:
            break
        else:
            logging.warning(f'fetching {href} error. retrying...')
            continue  # parse failed, retry
    if movie:
        write_movie_json(href, movie)
        print(f'Fetch succeeded. Result saved in {movies_dir}')
# Save pending data when the process is terminated
def handle_exit_signal(signal, frame):
logging.info("Gracefully exiting... Saving remaining data to Json and CSV.")
save_data()
sys.exit(0)
# Full crawl
def main():
try:
        # Register exit signal handlers
signal.signal(signal.SIGINT, handle_exit_signal) # Handle Ctrl+C
signal.signal(signal.SIGTERM, handle_exit_signal) # Handle kill signal
process_movies()
finally:
        # Cleanup that runs on normal exit
save_data()
logging.info("Data processing completed.")
# Entry point: read command-line arguments
if __name__ == "__main__":
if len(sys.argv) > 1:
url = sys.argv[1]
process_one(url)
else:
main()

View File

@ -0,0 +1,255 @@
"""
Script Name:
Description: Fetch information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
    detail_fetch.py        Pulls details one by one from the locally saved list data and writes them to files.
    list_fetch_astro.py    Fetches performer lists by astrological sign. Moderate volume, fairly complete detail fields.
    list_fetch_birth.py    Fetches performer lists by birthday. Moderate volume, fairly complete detail fields.
    list_fetch_ethnic.py   Fetches performer lists by ethnicity. Large volume, but many of the detail fields are invalid.
    list_merge.py          Intersects the three lists above to produce the overall data set.
    iafd_scrape.py         Built on https://github.com/stashapp/CommunityScrapers: given a performer's IAFD link, returns data in a stashapp-compatible format. (Of limited use, since nationality, photos, etc. do not match.)
    html_format.py         Reads the saved HTML directory, extracts the information, and writes formatted output.
    data_merge.py          Merges performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp (exported) into one set.
    stashdb_merge.py       Batch-merges the per-performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to data/tmp, unpacked, and then merged,
                           which yields one complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import json
import time
import csv
import argparse
from bs4 import BeautifulSoup
import logging
import config
config.setup_logging()
# Base URL
host_url = "https://www.iafd.com"
# Result paths
res_dir = f"{config.global_share_data_dir}/iafd"
fetch_config = {
'dist': {
'base_url': f"{host_url}/distrib.rme/distrib=",
'list_page_url': f"{host_url}/distrib.asp",
'html_table_id': 'distable',
'html_select_name': 'Distrib',
'output_key_id': 'distributors',
'json_file': f'{res_dir}/distributors.json',
'csv_file': f'{res_dir}/distributors.csv',
},
'stu': {
'base_url': f"{host_url}/studio.rme/studio=",
'list_page_url': f"{host_url}/studio.asp",
'html_table_id': 'studio',
'html_select_name': 'Studio',
'output_key_id': 'studios',
'json_file': f'{res_dir}/studios.json',
'csv_file': f'{res_dir}/studios.csv',
}
}
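# Illustrative sketch (not part of the original script): how a fetch_config entry is used.
# The 'dist' key comes from fetch_config above; the id 6496 is a sample value from distr_map below.
def _fetch_config_example():
    cfg = fetch_config['dist']
    # Per-distributor list pages are built by appending the numeric id to base_url,
    # e.g. https://www.iafd.com/distrib.rme/distrib=6496
    return cfg['base_url'] + str(6496)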
distr_map = {
6812 : 'nubilefilms.com',
8563 : 'teenmegaworld network',
6779 : 'x-art.com',
7133 : 'tushy.com',
6496 : 'blacked.com',
7758 : 'vixen.com',
6791 : 'teamskeet.com',
12454: 'vip4k.com',
13541: 'wow network',
9702 : 'cum4k.com',
6778 : 'tiny4k.com',
12667: 'anal4k.com',
7419 : 'exotic4k.com',
13594: 'facials4k.com',
13633: 'mom4k.com',
12335: 'slim4k.com',
16709: 'strippers4k.com',
}
studio_map = {
6812 : 'nubilefilms.com',
9811 : 'Teen Mega World',
6779 : 'x-art.com',
7133 : 'tushy.com',
6496 : 'blacked.com',
7758 : 'vixen.com',
6791 : 'teamskeet.com',
8052: 'wowgirls.com',
9702 : 'cum4k.com',
6778 : 'tiny4k.com',
12667: 'anal4k.com',
7419 : 'exotic4k.com',
13594: 'facials4k.com',
13633: 'mom4k.com',
12335: 'slim4k.com',
16709: 'strippers4k.com',
}
# Headers and scraper setup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
all_data = []
# Fetch a page over the network
def fetch_page(url):
try:
response = scraper.get(url, headers=headers)
response.raise_for_status()
return response.text
except Exception as e:
logging.error(f"Failed to fetch {url}: {e}")
return None
# Parse the HTML content and extract the required rows
def parse_page(html, name, config):
table_id = config['html_table_id']
key_id = config['output_key_id']
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", id=table_id)
if not table:
logging.warning(f"Warning: No {table_id} table found in {name}")
return None
    # Find the thead and drop it
    thead = table.find('thead')
    if thead:
        thead.decompose()  # remove the thead; it does not need to be parsed
    # Only the tbody remains now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []
global all_data
for row in rows:
cols = row.find_all('td')
if len(cols) >= 5:
title = cols[0].text.strip()
label = cols[1].text.strip()
year = cols[2].text.strip()
rev = cols[3].text.strip()
a_href = cols[0].find('a')
href = host_url + a_href['href'] if a_href else ''
all_data.append({
key_id: name,
'title': title,
'label': label,
'year': year,
'rev': rev,
'href': href
})
return soup
# Handle pagination (not needed for these list pages, so always return None)
def handle_pagination(soup, astro):
    return None
# Fetch the list page and build the id -> name map
def process_list_gage(config):
list_page_url=config['list_page_url']
select_name = config['html_select_name']
list_map = {}
logging.info(f"Fetching data for {list_page_url} ...")
select_element = None
while True:
html = fetch_page(list_page_url)
if html:
soup = BeautifulSoup(html, "html.parser")
select_element = soup.find('select', {'name': select_name})
            if select_element:
                break
            else:
                logging.info(f"wrong html content. retrying {list_page_url} ...")
        else:
            logging.info(f"fetch failed. retrying {list_page_url} ...")
if not select_element:
return None
options = select_element.find_all('option')
for option in options:
        value = option.get('value')  # the option's value attribute
        text = option.text.strip()   # the option's text content
        list_map[int(value)] = text
    logging.info(f'fetched {list_page_url} successfully. total entries: {len(list_map)}')
return list_map
# Main loop: iterate over each distributor/studio
def process_main_data(list_data, config):
base_url = config['base_url']
for key, name in list_data.items():
url = base_url + str(key)
next_url = url
logging.info(f"Fetching data for {name}, url {url} ...")
while next_url:
html = fetch_page(next_url)
if html:
soup = parse_page(html, name, config)
if soup:
next_url = handle_pagination(soup, name)
                else:
                    logging.info(f"wrong html content. retrying {next_url} ...")
                # Save results periodically
                save_data(config)
                time.sleep(2)  # throttle the request rate
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying
# Save results to the JSON and CSV files
def save_data(config):
with open(config['json_file'], 'w', encoding='utf-8') as json_file:
json.dump(all_data, json_file, indent=4, ensure_ascii=False)
with open(config['csv_file'], 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=[config['output_key_id'], 'title', 'label', 'year', 'rev', 'href'])
writer.writeheader()
writer.writerows(all_data)
# Main entry point
if __name__ == '__main__':
    # Parse command-line arguments
parser = argparse.ArgumentParser(description='fetch movie list from iafd.com')
parser.add_argument('--type', type=str, default='dist', help='fetch by ... (dist , stu)')
parser.add_argument('--kind', type=str, default='parts', help='fetch all or parts (parts , all)')
args = parser.parse_args()
    config = fetch_config.get(args.type)
    if not config:
        logging.warning(f'unknown type: {args.type} {args.kind}')
else:
list_data = {}
if args.kind == 'all':
list_data = process_list_gage(config)
elif args.type == 'dist':
list_data = distr_map
else:
list_data = studio_map
process_main_data(list_data, config)
logging.info("Data fetching and saving completed.")

View File

@ -0,0 +1,393 @@
"""
Script Name:
Description: Fetch information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
    detail_fetch.py        Pulls details one by one from the locally saved list data and writes them to files.
    list_fetch_astro.py    Fetches performer lists by astrological sign. Moderate volume, fairly complete detail fields.
    list_fetch_birth.py    Fetches performer lists by birthday. Moderate volume, fairly complete detail fields.
    list_fetch_ethnic.py   Fetches performer lists by ethnicity. Large volume, but many of the detail fields are invalid.
    list_merge.py          Intersects the three lists above to produce the overall data set.
    iafd_scrape.py         Built on https://github.com/stashapp/CommunityScrapers: given a performer's IAFD link, returns data in a stashapp-compatible format. (Of limited use, since nationality, photos, etc. do not match.)
    html_format.py         Reads the saved HTML directory, extracts the information, and writes formatted output.
    data_merge.py          Merges performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp (exported) into one set.
    stashdb_merge.py       Batch-merges the per-performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to data/tmp, unpacked, and then merged,
                           which yields one complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import config
# Configure logging
config.setup_logging()
# Result paths
res_dir = '../result'
res_json_file = f'{res_dir}/detail.json'
res_csv_file = f'{res_dir}/detail.csv'
input_json_file = f'{res_dir}/merged.json'
performers_dir = f'{res_dir}/performers'
# Accumulated results
final_data = []
# Load the data already in detail.json so the crawl can resume where it left off
def load_existing_hrefs():
existing_hrefs = set()
global final_data
try:
with open(res_json_file, 'r') as file:
final_data = json.load(file)
for entry in final_data:
existing_hrefs.add(entry['href'])
except FileNotFoundError:
logging.info("detail.json not found, starting fresh.")
return existing_hrefs
# Parse a credits table (both personal appearances and directing credits)
def parse_credits_table(table, distributor_list):
    # Find the thead and drop it
    thead = table.find('thead')
    if thead:
        thead.decompose()  # remove the thead; it does not need to be parsed
    # Only the tbody remains now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []
    movies = []
    distributor_count = {key: 0 for key in distributor_list}  # initialize each distributor's count
    # rows = table.find_all('tr', class_='we')
for row in rows:
cols = row.find_all('td')
if len(cols) >= 6:
title = cols[0].text.strip()
year = cols[1].text.strip()
distributor = cols[2].text.strip().lower()
notes = cols[3].text.strip()
rev = cols[4].text.strip()
formats = cols[5].text.strip()
for key in distributor_list:
if key in distributor:
distributor_count[key] += 1
movies.append({
'title': title,
'year': year,
'distributor': distributor,
'notes': notes,
'rev': rev,
'formats': formats
})
return movies, distributor_count
# Fetch a performer page and extract the required data
def fetch_and_parse_page(url, scraper):
try:
response = scraper.get(url)
if response.status_code != 200:
logging.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
return None, None
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extracted data
        data = {}
        # Field names we want, mapped to the labels used in the HTML
fields = {
'performer_aka': 'Performer AKA',
'birthday': 'Birthday',
'astrology': 'Astrology',
'birthplace': 'Birthplace',
'gender': 'Gender',
'years_active': 'Years Active',
'ethnicity': 'Ethnicity',
'nationality': 'Nationality',
'hair_colors': 'Hair Colors',
'eye_color': 'Eye Color',
'height': 'Height',
'weight': 'Weight',
'measurements': 'Measurements',
'tattoos': 'Tattoos',
'piercings': 'Piercings'
}
reversed_map = {v: k for k, v in fields.items()}
        # Parse the credits tables (movies performed in or directed)
        role_list = ['personal', 'directoral']
        distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
        credits_list = {}
        # Use a dict to keep the per-distributor counts
        distributor_count = {key: 0 for key in distributor_list}  # initialize each distributor's count
for role in role_list:
table = soup.find('table', id=role)
if table :
movies, stat_map = parse_credits_table(table, distributor_list)
credits_list[role] = movies
                # Update the distributor counts
for distributor in distributor_list:
distributor_count[distributor] += stat_map.get(distributor, 0)
        # Count the total number of movies
        #movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
        movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
        # Warn if no credits table was found
        if len(credits_list) == 0:
            logging.warning(f"movie table empty. url: {url} ")
        # Walk every bioheading to collect the performer metadata
bioheadings = soup.find_all('p', class_='bioheading')
for bio in bioheadings:
heading = bio.text.strip()
biodata = None
            # The "Performer" heading needs special handling (it maps to Performer AKA)
            if 'Performer' in heading:
                heading = 'Performer AKA'
                biodata_div = bio.find_next('div', class_='biodata')
                if biodata_div:
                    div_text = biodata_div.get_text(separator='|').strip()
                    biodata = [b.strip() for b in div_text.split('|') if b.strip()]
            else:
                biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
            # Store the value if the heading is one we track
            if heading in reversed_map:
                kkey = reversed_map[heading]
                data[kkey] = biodata
        # Add the counters to the result
data['movies_cnt'] = movies_cnt
data['vixen_cnt'] = distributor_count['vixen']
data['blacked_cnt'] = distributor_count['blacked']
data['tushy_cnt'] = distributor_count['tushy']
data['x_art_cnt'] = distributor_count['x-art']
return data, credits_list
except RequestException as e:
logging.error(f"Error fetching {url}: {e}")
return None, None
# Write detail.json
def write_to_detail_json(data):
with open(res_json_file, 'w', encoding='utf-8') as json_file:
json.dump(data, json_file, indent=4, ensure_ascii=False)
# Write the CSV file
def write_to_csv(data):
try:
with open(res_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile, delimiter=',')
header = ['person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender', 'years_active', 'ethnicity',
'nationality', 'hair_colors', 'eye_color', 'height', 'weight', 'measurements', 'tattoos', 'piercings',
'movies_cnt', 'vixen_cnt', 'blacked_cnt', 'tushy_cnt', 'x_art_cnt']
writer.writerow(header)
for entry in data:
                # Make sure performer_aka is always a list
                performer_aka = entry.get('performer_aka', [])
                # If it is None or not a list type, normalize it to a list
                if performer_aka is None:
                    performer_aka = []
                elif not isinstance(performer_aka, list):
                    performer_aka = [performer_aka]
writer.writerow([
entry.get('person', ''),
entry.get('href', ''),
'|'.join(performer_aka),
entry.get('birthday', ''),
entry.get('astrology', ''),
entry.get('birthplace', ''),
entry.get('gender', ''),
entry.get('years_active', ''),
entry.get('ethnicity', ''),
entry.get('nationality', ''),
entry.get('hair_colors', ''),
entry.get('eye_color', ''),
entry.get('height', ''),
entry.get('weight', ''),
entry.get('measurements', ''),
entry.get('tattoos', ''),
entry.get('piercings', ''),
entry.get('movies_cnt', 0),
entry.get('vixen_cnt', 0),
entry.get('blacked_cnt', 0),
entry.get('tushy_cnt', 0),
entry.get('x_art_cnt', 0)
])
except Exception as e:
logging.error(f"Error writing to CSV: {e}")
def handle_exit_signal(signal, frame):
logging.info("Gracefully exiting... Saving remaining data to Json and CSV.")
write_to_csv(final_data) # Ensure final data is written when exiting
write_to_detail_json(final_data)
sys.exit(0)
# Create a sub-directory for a performer
def create_directory_for_person(person):
    # Use the lowercased first character of the name as the sub-directory
    person_dir = person[:1].lower()
    full_path = os.path.join(performers_dir, person_dir)
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    return full_path
# Extract the id value from a URL such as https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from an href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
# Write an individual JSON file for each performer
def write_person_json(person, href, data):
    # Build the target directory and file name
    person_dir = create_directory_for_person(person)
    person_id = extract_id_from_href(href)
    person_filename = f"{person.replace(' ', '-')}({person_id}).json"  # spaces replaced with '-'
full_path = os.path.join(person_dir, person_filename)
try:
with open(full_path, 'w', encoding='utf-8') as json_file:
json.dump(data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {full_path}: {e}")
# Fetch and process a single specified URL
def process_one(href):
    # Initialize cloudscraper
    scraper = cloudscraper.create_scraper()
    # Fetch and parse the data, retrying until it succeeds
    while True:
        data, movies = fetch_and_parse_page(href, scraper)
        if data is None:
            logging.warning(f'Retrying {href}')
            time.sleep(3)
        else:
            break
    # Write the performer's standalone JSON file
    full_data = {
        **data,
        'credits': movies if movies else {}
    }
    person_id = extract_id_from_href(href)
    person_filename = f"{person_id}.json"
try:
with open(person_filename, 'w', encoding='utf-8') as json_file:
json.dump(full_data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {person_filename}: {e}")
    print(f'Fetch succeeded. Result saved in {person_filename}')
def process_all():
    # Initialize cloudscraper
    scraper = cloudscraper.create_scraper()
    # Load the already-processed hrefs
    global final_data
    existing_hrefs = load_existing_hrefs()
    logging.info(f"load data from {res_json_file}, count: {len(final_data)}")
    # Read merged.json
with open(input_json_file, 'r') as file:
merged_data = json.load(file)
    # Iterate over the entries in merged.json
loop = 0
for entry in merged_data:
href = entry.get('href')
person = entry.get('person')
if href in existing_hrefs:
logging.info(f"Skipping {href} - already processed")
continue
logging.info(f"Processing {href} - {person}")
        # Fetch and parse the data, retrying until it succeeds
        while True:
            data, credits = fetch_and_parse_page(href, scraper)
            if data is None:
                logging.warning(f'Retrying {href} - {person}')
                time.sleep(3)
            else:
                break
        # If the data is valid, append it to final_data
final_data.append({
'href': href,
'person': person,
**data
})
        # Write the performer's standalone JSON file
full_data = {
'href': href,
'person': person,
**data,
'credits': credits if credits else {}
}
write_person_json(person.strip(), href, full_data)
        # Periodically flush detail.json and the CSV
        loop = loop + 1
        if loop % 100 == 0:
            logging.info(f'flush data to json file. now fetched data count: {loop}, total count: {len(final_data)}')
            write_to_detail_json(final_data)
            write_to_csv(final_data)
        # Record the href as processed
        existing_hrefs.add(href)
        # Throttle requests to avoid being blocked
        time.sleep(1)
# Full crawl
def main():
try:
        # Register exit signal handlers
signal.signal(signal.SIGINT, handle_exit_signal) # Handle Ctrl+C
signal.signal(signal.SIGTERM, handle_exit_signal) # Handle kill signal
process_all()
finally:
        # Cleanup that runs on normal exit
write_to_csv(final_data) # Write to CSV or other necessary tasks
write_to_detail_json(final_data) # Save data to JSON
logging.info("Data processing completed.")
if __name__ == "__main__":
if len(sys.argv) > 1:
url = sys.argv[1]
process_one(url)
else:
main()

View File

@ -0,0 +1,140 @@
"""
Script Name:
Description: Fetch information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
    detail_fetch.py        Pulls details one by one from the locally saved list data and writes them to files.
    list_fetch_astro.py    Fetches performer lists by astrological sign. Moderate volume, fairly complete detail fields.
    list_fetch_birth.py    Fetches performer lists by birthday. Moderate volume, fairly complete detail fields.
    list_fetch_ethnic.py   Fetches performer lists by ethnicity. Large volume, but many of the detail fields are invalid.
    list_merge.py          Intersects the three lists above to produce the overall data set.
    iafd_scrape.py         Built on https://github.com/stashapp/CommunityScrapers: given a performer's IAFD link, returns data in a stashapp-compatible format. (Of limited use, since nationality, photos, etc. do not match.)
    html_format.py         Reads the saved HTML directory, extracts the information, and writes formatted output.
    data_merge.py          Merges performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp (exported) into one set.
    stashdb_merge.py       Batch-merges the per-performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to data/tmp, unpacked, and then merged,
                           which yields one complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import json
import time
import csv
from bs4 import BeautifulSoup
import logging
import config
config.setup_logging()
# Base URL and list of astrological signs
host_url = "https://www.iafd.com"
base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
# Headers and scraper setup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Result paths
res_dir = '../result'
# Accumulated astro_map records
astro_map = []
# Fetch a page over the network
def fetch_page(url):
try:
response = scraper.get(url, headers=headers)
response.raise_for_status()
return response.text
except Exception as e:
logging.error(f"Failed to fetch {url}: {e}")
return None
# Parse the HTML content and extract the required data
def parse_page(html, astro):
soup = BeautifulSoup(html, "html.parser")
astro_div = soup.find("div", id="astro")
if not astro_div:
logging.warning(f"Warning: No 'astro' div found in {astro}")
return None
flag = False
list_cnt = 0
birth_date = None
for elem in astro_div.find_all(recursive=False):
if elem.name == "h3" and "astroday" in elem.get("class", []):
birth_date = elem.get_text(strip=True)
elif elem.name == "div" and "perficon" in elem.get("class", []):
a_tag = elem.find("a")
if a_tag:
href = host_url + a_tag["href"]
name = a_tag.find("span", class_="perfname")
if name:
astro_map.append({
"astrology": astro,
"birth_date": birth_date,
"person": name.get_text(strip=True),
"href": href
})
flag = True
list_cnt = list_cnt +1
if flag:
logging.info(f"get {list_cnt} persons from this page. total persons: {len(astro_map)}")
return soup
else:
return None
# Handle pagination (astrology pages need no pagination, so always return None)
def handle_pagination(soup, astro):
    return None
# Main loop: process each astrological sign
def process_astro_data():
for astro in astro_list:
url = base_url + astro
next_url = url
logging.info(f"Fetching data for {astro}, url {url} ...")
while next_url:
html = fetch_page(next_url)
if html:
soup = parse_page(html, astro)
if soup:
next_url = handle_pagination(soup, astro)
                else:
                    logging.info(f"wrong html content. retrying {next_url} ...")
                # Save results periodically
                save_data()
                time.sleep(2)  # throttle the request rate
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying
# Save results to the JSON and CSV files
def save_data():
with open(f'{res_dir}/astro.json', 'w', encoding='utf-8') as json_file:
json.dump(astro_map, json_file, indent=4, ensure_ascii=False)
with open(f'{res_dir}/astro.csv', 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=['astrology', 'birth_date', 'person', 'href'])
writer.writeheader()
writer.writerows(astro_map)
# Main entry point
if __name__ == '__main__':
process_astro_data()
save_data()
logging.info("Data fetching and saving completed.")

View File

@ -0,0 +1,152 @@
"""
Script Name:
Description: Fetch information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
    detail_fetch.py        Pulls details one by one from the locally saved list data and writes them to files.
    list_fetch_astro.py    Fetches performer lists by astrological sign. Moderate volume, fairly complete detail fields.
    list_fetch_birth.py    Fetches performer lists by birthday. Moderate volume, fairly complete detail fields.
    list_fetch_ethnic.py   Fetches performer lists by ethnicity. Large volume, but many of the detail fields are invalid.
    list_merge.py          Intersects the three lists above to produce the overall data set.
    iafd_scrape.py         Built on https://github.com/stashapp/CommunityScrapers: given a performer's IAFD link, returns data in a stashapp-compatible format. (Of limited use, since nationality, photos, etc. do not match.)
    html_format.py         Reads the saved HTML directory, extracts the information, and writes formatted output.
    data_merge.py          Merges performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp (exported) into one set.
    stashdb_merge.py       Batch-merges the per-performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to data/tmp, unpacked, and then merged,
                           which yields one complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import requests
import cloudscraper
import json
import time
import csv
from bs4 import BeautifulSoup
import logging
import config
config.setup_logging()
# Create a cloudscraper session
# Headers and scraper setup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Result paths
res_dir = '../result'
# Accumulated birthday records
birth_map = []
# Base URLs
host_url = "https://www.iafd.com"
base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
# Fetch the calendar page for a given month/day
def fetch_page(month, day):
url = base_url.format(month=month, day=day)
retries = 3
while retries > 0:
try:
            # Send the request and fetch the page
logging.info(f"Fetching URL: {url}")
response = scraper.get(url)
response.raise_for_status()
return response.text
except requests.exceptions.RequestException as e:
logging.error(f"Request failed: {e}")
retries -= 1
            time.sleep(2)  # wait 2 seconds before retrying
return None
# Parse the page content and update birth_map
def parse_page(html, month, day):
soup = BeautifulSoup(html, 'html.parser')
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
if not datarows:
return None
flag = False
list_cnt = 0
rows = datarows[0].find_all('div', class_='col-sm-4')
for row in rows:
link_tag = row.find('a')
person = link_tag.text.strip() if link_tag else ''
href = link_tag['href'] if link_tag else ''
href = host_url + href
        # Skip if this href is already in birth_map
        flag = True
        if any(entry['href'] == href for entry in birth_map):
            continue
        # Add the entry to birth_map
birth_map.append({
'month': month,
'day': day,
'person': person,
'href': href
})
list_cnt = list_cnt +1
if flag:
logging.info(f"get {list_cnt} persons from this page. total persons: {len(birth_map)}")
return soup
else:
return None
# Loop over every month/day combination
def fetch_birthdays():
    for month in range(1, 13):  # months 1 to 12
        for day in range(1, 32):  # days 1 to 31
            logging.info(f"Processing: Month {month}, Day {day}")
while True:
html = fetch_page(month, day)
if html:
soup = parse_page(html, month, day)
if soup:
                        # Save results periodically
                        save_data()
                        # Throttle, then break out of the while loop to move on to the next date
                        time.sleep(2)
break
else:
logging.warning(f"No data. Retrying: Month {month}, Day {day}")
                        time.sleep(3)  # wait before retrying
                else:
                    logging.warning(f"Network error. Retrying: Month {month}, Day {day}")
                    time.sleep(3)  # wait before retrying
# Save birth_map to the JSON and CSV files
def save_data():
with open(f'{res_dir}/birth.json', 'w', encoding='utf-8') as f:
json.dump(birth_map, f, ensure_ascii=False, indent=4)
with open(f'{res_dir}/birth.csv', 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=['month', 'day', 'person', 'href'])
writer.writeheader()
for entry in birth_map:
writer.writerow(entry)
# Main function
def main():
    # Fetch the data
    fetch_birthdays()
    # Save the results
save_data()
if __name__ == '__main__':
main()

View File

@ -0,0 +1,166 @@
"""
Script Name:
Description: Fetch information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
    detail_fetch.py        Pulls details one by one from the locally saved list data and writes them to files.
    list_fetch_astro.py    Fetches performer lists by astrological sign. Moderate volume, fairly complete detail fields.
    list_fetch_birth.py    Fetches performer lists by birthday. Moderate volume, fairly complete detail fields.
    list_fetch_ethnic.py   Fetches performer lists by ethnicity. Large volume, but many of the detail fields are invalid.
    list_merge.py          Intersects the three lists above to produce the overall data set.
    iafd_scrape.py         Built on https://github.com/stashapp/CommunityScrapers: given a performer's IAFD link, returns data in a stashapp-compatible format. (Of limited use, since nationality, photos, etc. do not match.)
    html_format.py         Reads the saved HTML directory, extracts the information, and writes formatted output.
    data_merge.py          Merges performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp (exported) into one set.
    stashdb_merge.py       Batch-merges the per-performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to data/tmp, unpacked, and then merged,
                           which yields one complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import json
import time
import csv
from bs4 import BeautifulSoup
import logging
import config
config.setup_logging()
# Base URL and ethnicity list
host_url = "https://www.iafd.com"
base_url = f"{host_url}/lookupethnic.rme/ethnic="
ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american', 'middle eastern', 'mediteranean', 'indian', 'polynesian', 'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african', 'south asian']
# Headers and scraper setup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Result paths
res_dir = '../result'
# Accumulated ethnic_map records
ethnic_map = []
# Fetch a page over the network
def fetch_page(url):
try:
response = scraper.get(url, headers=headers)
response.raise_for_status()
return response.text
except Exception as e:
logging.error(f"Failed to fetch {url}: {e}")
return None
# Parse the HTML content and extract the required data
def parse_page(html, ethnic):
    # Manually patch up the HTML tags
    html = html.replace('<br>', '').replace('<a ', '<a target="_blank" ')  # fix some malformed tags
    soup = BeautifulSoup(html, 'lxml')  # use the lxml parser
    #soup = BeautifulSoup(html, 'html.parser')
rows = soup.find_all('div', class_='row headshotrow')
flag = False
list_cnt = 0
for row in rows:
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
link_tag = col.find('a')
img_tag = col.find('div', class_='pictag')
flag = True
if link_tag and img_tag:
href = host_url + link_tag['href']
person = img_tag.text.strip()
                # Store the entry in ethnic_map
ethnic_map.append({
'ethnic': ethnic,
'person': person,
'href': href
})
list_cnt = list_cnt +1
if flag:
logging.info(f"get {list_cnt} persons from this page. total persons: {len(ethnic_map)}")
return soup
else:
return None
# Handle pagination
def handle_pagination(soup, ethnic):
next_page = soup.find('a', rel='next')
if next_page:
next_url = host_url + next_page['href']
logging.info(f"Found next page: {next_url}")
return next_url
else:
logging.info(f"All pages fetched for {ethnic}.")
return None
# Format ethnicity names that contain spaces
def format_ethnic(ethnic):
return ethnic.replace(' ', '+')
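# Illustrative sketch (not part of the original script): 'native american' from ethnic_list
# becomes 'native+american', giving .../lookupethnic.rme/ethnic=native+american
def _format_ethnic_example():
    return base_url + format_ethnic('native american')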
# Main loop: process each ethnicity
def process_ethnic_data():
    all_person = len(ethnic_map)  # should be 0 at this point
    all_pages = 0
for ethnic in ethnic_list:
url = base_url + format_ethnic(ethnic)
next_url = url
cursor = int(all_person / 100)
pages = 0
logging.info(f"--------Fetching data for {ethnic}, url {url} ...")
while next_url:
html = fetch_page(next_url)
if html:
soup = parse_page(html, ethnic)
if soup:
next_url = handle_pagination(soup, ethnic)
pages = pages + 1
else:
                    logging.info(f"wrong html content. retrying {next_url} ...")
                # Update the counters and save results periodically
                if len(ethnic_map) / 100 > cursor:
                    cursor = int(len(ethnic_map) / 100)
                    save_data()
                time.sleep(2)  # throttle the request rate
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying
        # Per-ethnicity statistics
        ethnic_person = len(ethnic_map) - all_person
        all_person = len(ethnic_map)
        all_pages = all_pages + pages
        logging.info(f"--------Fetching data for {ethnic} end. total pages: {pages}, total persons: {ethnic_person}, all persons fetched: {all_person}")
    # Final statistics
logging.info(f"--------Fetching all data end. total ethnic: {len(ethnic_list)}, total pages: {all_pages}, total persons: {all_person}")
# Save results to the JSON and CSV files
def save_data():
with open(f'{res_dir}/ethnic.json', 'w', encoding='utf-8') as json_file:
json.dump(ethnic_map, json_file, indent=4, ensure_ascii=False)
with open(f'{res_dir}/ethnic.csv', 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=['ethnic', 'person', 'href'])
writer.writeheader()
writer.writerows(ethnic_map)
# Main entry point
if __name__ == '__main__':
process_ethnic_data()
save_data()
logging.info("Data fetching and saving completed.")

View File

@ -0,0 +1,120 @@
"""
Script Name:
Description: Fetch information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
    detail_fetch.py        Pulls details one by one from the locally saved list data and writes them to files.
    list_fetch_astro.py    Fetches performer lists by astrological sign. Moderate volume, fairly complete detail fields.
    list_fetch_birth.py    Fetches performer lists by birthday. Moderate volume, fairly complete detail fields.
    list_fetch_ethnic.py   Fetches performer lists by ethnicity. Large volume, but many of the detail fields are invalid.
    list_merge.py          Intersects the three lists above to produce the overall data set.
    iafd_scrape.py         Built on https://github.com/stashapp/CommunityScrapers: given a performer's IAFD link, returns data in a stashapp-compatible format. (Of limited use, since nationality, photos, etc. do not match.)
    html_format.py         Reads the saved HTML directory, extracts the information, and writes formatted output.
    data_merge.py          Merges performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp (exported) into one set.
    stashdb_merge.py       Batch-merges the per-performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to data/tmp, unpacked, and then merged,
                           which yields one complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import json
import csv
import os
import argparse
from collections import defaultdict
# Result paths
res_dir = '../result'
# Read a JSON file and return its contents
def read_json(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return []
    except json.JSONDecodeError:
        print(f"Failed to parse file {file_path}.")
        return []
# Process the data: de-duplicate by href and merge the person fields
def process_data(files):
    href_map = defaultdict(list)
    # Read and process each file
for file in files:
data = read_json(file['path'])
for entry in data:
href = entry.get('href')
person = entry.get('person')
if href:
href_map[href].append(person)
    # Merge the persons that share the same href, joined with "|"
    result = []
    for href, persons in href_map.items():
        person = '|'.join(set(persons))  # de-duplicate, then join
result.append({'href': href, 'person': person})
return result
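# Illustrative sketch (not part of the original script): this mirrors the merging logic above.
# Two entries sharing an href collapse into one record whose person field joins the distinct
# names with "|". The href and names below are made-up sample values.
def _process_data_example():
    sample = [
        {'href': 'https://www.iafd.com/person.rme/id=example', 'person': 'Jane Doe'},
        {'href': 'https://www.iafd.com/person.rme/id=example', 'person': 'Jane D.'},
    ]
    href_map = defaultdict(list)
    for entry in sample:
        href_map[entry['href']].append(entry['person'])
    # Result: [{'href': ..., 'person': 'Jane Doe|Jane D.'}] (order of the joined names may vary)
    return [{'href': href, 'person': '|'.join(set(persons))} for href, persons in href_map.items()]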
# Save the result to a JSON file
def save_to_json(data, output_file):
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
# Save the result to a CSV file
def save_to_csv(data, output_file):
with open(output_file, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=['href', 'person'])
writer.writeheader()
writer.writerows(data)
# Main function: process the data and save the results
def main():
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Merge multiple JSON files and write the result to new JSON and CSV files")
    parser.add_argument('files', nargs='+', choices=['birth', 'astro', 'ethnic'],
                        help="Files to merge; pick at least two and at most three of: birth, astro, ethnic")
    args = parser.parse_args()
    # Require at least two files
    if len(args.files) < 2:
        print("Please select at least two files to merge.")
return
    # Input files to process
file_map = {
'birth': f'{res_dir}/birth.json',
'astro': f'{res_dir}/astro.json',
'ethnic': f'{res_dir}/ethnic.json'
}
files = [{'path': file_map[file], 'name': file} for file in args.files]
    # Process the data
    processed_data = process_data(files)
    # Build the merged output file names from the selected inputs
    output_json_file = f'{res_dir}/merged_{"_".join(args.files)}.json'
    output_csv_file = f'{res_dir}/merged_{"_".join(args.files)}.csv'
    # Make sure the result directory exists
    os.makedirs(f'{res_dir}', exist_ok=True)
    # Write the results to the JSON and CSV files
save_to_json(processed_data, output_json_file)
save_to_csv(processed_data, output_csv_file)
    print(f"Processing complete. Results saved to {output_json_file} and {output_csv_file}")
if __name__ == "__main__":
main()

236
iafd/tools/data_merge.py Normal file
View File

@ -0,0 +1,236 @@
"""
Script Name:
Description: Fetch information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
    detail_fetch.py        Pulls details one by one from the locally saved list data and writes them to files.
    list_fetch_astro.py    Fetches performer lists by astrological sign. Moderate volume, fairly complete detail fields.
    list_fetch_birth.py    Fetches performer lists by birthday. Moderate volume, fairly complete detail fields.
    list_fetch_ethnic.py   Fetches performer lists by ethnicity. Large volume, but many of the detail fields are invalid.
    list_merge.py          Intersects the three lists above to produce the overall data set.
    iafd_scrape.py         Built on https://github.com/stashapp/CommunityScrapers: given a performer's IAFD link, returns data in a stashapp-compatible format. (Of limited use, since nationality, photos, etc. do not match.)
    html_format.py         Reads the saved HTML directory, extracts the information, and writes formatted output.
    data_merge.py          Merges performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp (exported) into one set.
    stashdb_merge.py       Batch-merges the per-performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to data/tmp, unpacked, and then merged,
                           which yields one complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import os
import json
import csv
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Input directory and output files
input_dir = 'data'
output_json_file = f'{input_dir}/iafd_merge.json'
output_csv_file = f'{input_dir}/iafd_merge.csv'
output_person_txt = f'{input_dir}/all_person.txt'
# Load iafd_meta.json
try:
with open(os.path.join(input_dir, 'iafd_meta.json'), 'r', encoding='utf-8') as file:
iafd_data = json.load(file)
logger.info("Loaded iafd_meta.json")
except Exception as e:
logger.error(f"Error loading iafd_meta.json: {e}")
iafd_data = []
# Load stashdb.json
try:
with open(os.path.join(input_dir, 'stashdb.json'), 'r', encoding='utf-8') as file:
stashdb_data = json.load(file)
logger.info("Loaded stashdb.json")
except Exception as e:
logger.error(f"Error loading stashdb.json: {e}")
stashdb_data = []
# Load javhd_meta.json
try:
with open(os.path.join(input_dir, 'javhd_meta.json'), 'r', encoding='utf-8') as file:
javhd_data = json.load(file)
logger.info("Loaded javhd_meta.json")
except Exception as e:
logger.error(f"Error loading javhd_meta.json: {e}")
javhd_data = []
# Load thelordofporn_meta.json
try:
with open(os.path.join(input_dir, 'thelordofporn_meta.json'), 'r', encoding='utf-8') as file:
lordporn_data = json.load(file)
logger.info("Loaded thelordofporn_meta.json")
except Exception as e:
logger.error(f"Error loading thelordofporn_meta.json: {e}")
lordporn_data = []
# Build all_meta_data, de-duplicated
all_meta_data = set()
# Collect the unique names from each data source
for person_entry in iafd_data:
all_meta_data.add(person_entry['person'])
for stashdb_entry in stashdb_data:
all_meta_data.add(stashdb_entry['name'])
for javhd_entry in javhd_data:
all_meta_data.add(javhd_entry['ja_name'])
for lordporn_entry in lordporn_data:
all_meta_data.add(lordporn_entry['pornstar'])
# List that will hold the merged entries
merged_data = []
# Walk all_meta_data and merge the sources according to the rules below
for person in all_meta_data:
# Initialise the merged record
merged_entry = {
'person': person
}
# Initialise all stashdb_entry fields to empty strings
stashdb_entry = {
'stashdb_gender': '',
'stashdb_birthdate': '',
'stashdb_ethnicity': '',
'stashdb_country': '',
'stashdb_height': '',
'stashdb_measurements': '',
'stashdb_fake_tits': '',
'stashdb_career_length': '',
'stashdb_aliases': ''
}
# Initialise all javhd_entry fields to empty strings
javhd_entry = {
'javhd_rank': '',
'javhd_height': '',
'javhd_weight': '',
'javhd_breast_size': '',
'javhd_breast_factor': '',
'javhd_birth_date': '',
'javhd_ethnicity': ''
}
# Initialise all lordporn_entry fields to empty strings
lordporn_entry = {
'lordporn_rating': '',
'lordporn_rank': '',
'lordporn_career_start': '',
'lordporn_measurements': '',
'lordporn_born': '',
'lordporn_height': '',
'lordporn_weight': ''
}
# in_iafd defaults to N
in_iafd = 'N'
iafd_match = next((item for item in iafd_data if item.get('person') == person), None)
if iafd_match:
in_iafd = 'Y'
# 1. Check whether the person exists in the stashdb data
in_stashdb = 'N'
stashdb_match = next((item for item in stashdb_data if item.get('name') == person), None)
if stashdb_match:
in_stashdb = 'Y'
# Update the stashdb_entry fields
stashdb_entry.update({
'stashdb_gender': stashdb_match.get('gender', ''),
'stashdb_birthdate': stashdb_match.get('birthdate', ''),
'stashdb_ethnicity': stashdb_match.get('ethnicity', ''),
'stashdb_country': stashdb_match.get('country', ''),
'stashdb_height': stashdb_match.get('height', ''),
'stashdb_measurements': stashdb_match.get('measurements', ''),
'stashdb_fake_tits': stashdb_match.get('fake_tits', ''),
'stashdb_career_length': stashdb_match.get('career_length', ''),
'stashdb_aliases': stashdb_match.get('aliases', '')
})
# 2. Check whether the person exists in the javhd data
in_javhd = 'N'
javhd_match = next((item for item in javhd_data if item.get('ja_name') == person), None)
if javhd_match:
in_javhd = 'Y'
# Update the javhd_entry fields
javhd_entry.update({
'javhd_rank': javhd_match.get('rank', ''),
'javhd_height': javhd_match.get('height', ''),
'javhd_weight': javhd_match.get('weight', ''),
'javhd_breast_size': javhd_match.get('breast size', ''),
'javhd_breast_factor': javhd_match.get('breast factor', ''),
'javhd_birth_date': javhd_match.get('birth date', ''),
'javhd_ethnicity': javhd_match.get('ethnicity', '')
})
# 3. Check whether the person exists in the thelordofporn data
in_lordporn = 'N'
lordporn_match = next((item for item in lordporn_data if item.get('pornstar') == person), None)
if lordporn_match:
in_lordporn = 'Y'
# Update the lordporn_entry fields
lordporn_entry.update({
'lordporn_rating': lordporn_match.get('rating', ''),
'lordporn_rank': lordporn_match.get('rank', ''),
'lordporn_career_start': lordporn_match.get('career_start', ''),
'lordporn_measurements': lordporn_match.get('measurements', ''),
'lordporn_born': lordporn_match.get('born', ''),
'lordporn_height': lordporn_match.get('height', ''),
'lordporn_weight': lordporn_match.get('weight', '')
})
# Add the in_iafd, in_stashdb, in_javhd and in_lordporn flags so they always appear in the output
merged_entry.update({
'in_iafd': in_iafd,
'in_stashdb': in_stashdb,
'in_javhd': in_javhd,
'in_lordporn': in_lordporn
})
# Fold stashdb_entry, javhd_entry and lordporn_entry into the merged record
merged_entry.update(stashdb_entry)
merged_entry.update(javhd_entry)
merged_entry.update(lordporn_entry)
# Append the merged entry to the result list
merged_data.append(merged_entry)
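# Performance note: each next(...) lookup above rescans its source list, so the merge is roughly
# O(len(all_meta_data) * len(source)) per source. For larger datasets, pre-built name indexes avoid
# the repeated scans. A minimal sketch (illustrative only, not wired into the loop above):
#
#   iafd_by_name = {item.get('person'): item for item in iafd_data}
#   stashdb_by_name = {item.get('name'): item for item in stashdb_data}
#   javhd_by_name = {item.get('ja_name'): item for item in javhd_data}
#   lordporn_by_name = {item.get('pornstar'): item for item in lordporn_data}
#   iafd_match = iafd_by_name.get(person)   # replaces next(... for item in iafd_data ...)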
# Write iafd_merge.json
try:
with open(output_json_file, 'w', encoding='utf-8') as json_file:
json.dump(merged_data, json_file, ensure_ascii=False, indent=4)
logger.info(f"Data successfully written to {output_json_file}")
except Exception as e:
logger.error(f"Error writing {output_json_file}: {e}")
# Write iafd_merge.csv
try:
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=merged_data[0].keys(), delimiter='\t')
writer.writeheader()
writer.writerows(merged_data)
logger.info(f"Data successfully written to {output_csv_file}")
except Exception as e:
logger.error(f"Error writing {output_csv_file}: {e}")
# Write all_meta_data to all_person.txt, sorted alphabetically
try:
# Sort all_meta_data
all_meta_data_list = sorted(list(all_meta_data))  # convert the set to a sorted list
all_meta_data_str = ','.join(all_meta_data_list)  # join the names with commas
with open(output_person_txt, 'w', encoding='utf-8') as txt_file:
txt_file.write(all_meta_data_str)
logger.info(f"all_meta_data successfully written to all_person.txt")
except Exception as e:
logger.error(f"Error writing all_person.txt: {e}")

163
iafd/tools/iafd_scrape.py Normal file
View File

@ -0,0 +1,163 @@
"""
Script Name:
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
从而获取到一份完整的数据列表。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import json
import os
import subprocess
import time
import logging
from typing import List
# Logging configuration
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Predefined CommunityScrapers directory
scrapers_dir = "/root/gitlabs/stashapp_CommunityScrapers/scrapers"
meta_file = "./data/iafd_meta.json"
cursor_file = "./data/iafd_cursor.txt"
output_dir = f"{scrapers_dir}/iafd_meta"
# Retry count and interval
MAX_RETRIES = 10
RETRY_DELAY = 5  # seconds between retries
# Create the output directory
os.makedirs(output_dir, exist_ok=True)
def read_processed_hrefs() -> set:
"""
Read the hrefs that have already been processed.
"""
processed_hrefs = set()
if os.path.exists(cursor_file):
with open(cursor_file, "r", encoding="utf-8") as f:
processed_hrefs = {line.strip().split(",")[1] for line in f if "," in line}
return processed_hrefs
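# Each cursor_file line has the form "<person>,<href>" (see process_iafd_meta() below); only the
# second comma-separated field, i.e. the href, is kept for the duplicate check.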
def execute_scraper_command(href: str, idv: str) -> bool:
"""
Run the scraper command for one performer; return True on success, False otherwise.
Includes a retry mechanism.
"""
command = f"cd {scrapers_dir}; python3 -m IAFD.IAFD performer {href} > {output_dir}/{idv}.json"
attempt = 0
while attempt < MAX_RETRIES:
try:
logger.info(f"执行命令: {command}")
subprocess.run(command, shell=True, check=True)
return True
except subprocess.CalledProcessError as e:
logger.error(f"执行命令失败: {e}. 重试 {attempt + 1}/{MAX_RETRIES}...")
time.sleep(RETRY_DELAY)
attempt += 1
logger.error(f"命令执行失败,已尝试 {MAX_RETRIES} 次: {command}")
return False
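# Note: the command above is built as a shell string, so href is interpolated unescaped. An
# equivalent invocation without shell=True (sketch only, assuming the same IAFD.IAFD module layout)
# would be:
#
#   with open(f"{output_dir}/{idv}.json", "w", encoding="utf-8") as out:
#       subprocess.run(["python3", "-m", "IAFD.IAFD", "performer", href],
#                      cwd=scrapers_dir, stdout=out, check=True)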
def validate_json_file(idv: str) -> bool:
"""
Validate that the generated JSON file is usable.
"""
output_file = f"{output_dir}/{idv}.json"
try:
with open(output_file, "r", encoding="utf-8") as f:
content = f.read().strip()
json_data = json.loads(content)  # try to parse the JSON
if "name" not in json_data:
raise ValueError("missing 'name' field")
return True
except (json.JSONDecodeError, ValueError) as e:
logger.error(f"Parse failed, removing invalid file: {output_file}. Error: {e}")
os.remove(output_file)
return False
def process_iafd_meta(data: List[dict], processed_hrefs: set) -> None:
"""
Process the entries from iafd_meta.json.
"""
for entry in data:
person = entry.get("person")
href = entry.get("href")
if not person or not href:
logger.warning(f"跳过无效数据: {entry}")
continue
# Extract the id from the href (split() never raises here, so check for the marker explicitly)
if "id=" not in href:
logger.error(f"Could not parse ID from: {href}")
continue
idv = href.split("id=")[-1]
output_file = f"{output_dir}/{idv}.json"
# Skip hrefs that have already been processed
if href in processed_hrefs:
logger.info(f"Already processed, skipping: {person}, {href}")
continue
# Run the scraper
if not execute_scraper_command(href, idv):
continue
# Validate the JSON output
if not validate_json_file(idv):
continue
# Record the processed entry
with open(cursor_file, "a", encoding="utf-8") as f:
f.write(f"{person},{href}\n")
logger.info(f"Successfully processed: {person} - {href}")
def main():
"""
Entry point.
"""
# Read the already processed hrefs
processed_hrefs = read_processed_hrefs()
# Load iafd_meta.json
try:
with open(meta_file, "r", encoding="utf-8") as f:
data = json.load(f)
except json.JSONDecodeError as e:
logger.error(f"Error reading iafd_meta.json: {e}")
return
# Process the data
process_iafd_meta(data, processed_hrefs)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,90 @@
"""
Script Name:
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
从而获取到一份完整的数据列表。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import os
import json
import csv
import logging
# Logging configuration
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Input and output locations
input_dir = 'data/tmp'  # assumes the exported metadata files sit under data/tmp in the current directory
output_json_file = 'stashdb.json'
output_csv_file = 'stashdb.csv'
# Holds all collected entries
data_list = []
# Walk the metadata directory and read every JSON file
for filename in os.listdir(input_dir):
if filename.endswith('.json'):
file_path = os.path.join(input_dir, filename)
try:
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)
# Extract the fields we need
person = {
'name': data.get('name'),
'gender': data.get('gender'),
'birthdate': data.get('birthdate'),
'ethnicity': data.get('ethnicity'),
'country': data.get('country'),
'height': data.get('height'),
'measurements': data.get('measurements'),
'fake_tits': data.get('fake_tits'),
'career_length': data.get('career_length'),
'aliases': ', '.join(data.get('aliases', []))  # join the aliases list into one string
}
# Append the entry to the list
data_list.append(person)
logger.info(f"Processed file: {filename}")
except Exception as e:
logger.error(f"Error processing file {filename}: {e}")
# Write the JSON output
try:
with open(output_json_file, 'w', encoding='utf-8') as json_file:
json.dump(data_list, json_file, ensure_ascii=False, indent=4)
logger.info(f"Data successfully written to {output_json_file}")
except Exception as e:
logger.error(f"Error writing JSON file: {e}")
# Write the CSV output
try:
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=data_list[0].keys())
writer.writeheader()
writer.writerows(data_list)
logger.info(f"Data successfully written to {output_csv_file}")
except Exception as e:
logger.error(f"Error writing CSV file: {e}")