modify scripts
iafd/merge/auto_tag.py (new file, 101 lines)
@@ -0,0 +1,101 @@
import sqlite3
import json
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Database connection
DB_PATH = 'your_database.db'  # database path; change to the actual path

# Predefined tags, easy to modify
TAG_LIST = ['vixen', 'blacked', 'tushy', 'x-art']

# Preload the tag IDs
def get_all_tag_ids():
    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT id, name FROM tags WHERE name IN ({})".format(', '.join(['?'] * len(TAG_LIST))), TAG_LIST)
            tags = cursor.fetchall()
            # Map tag name to tag_id
            return {tag_name.lower(): tag_id for tag_id, tag_name in tags}
    except Exception as e:
        logger.error(f"Error fetching tag IDs: {e}")
        return {}

# Look up performer_id for a batch of performer names
def get_performers_ids(performer_names):
    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()
            query = "SELECT id, name FROM performers WHERE LOWER(name) IN ({})".format(
                ','.join(['?'] * len(performer_names))
            )
            cursor.execute(query, [name.lower() for name in performer_names])
            performers = cursor.fetchall()
            return {performer_name.lower(): performer_id for performer_id, performer_name in performers}
    except Exception as e:
        logger.error(f"Error fetching performer IDs: {e}")
        return {}

# Insert a row into the performers_tags table
def insert_performer_tag(performer_id, tag_id):
    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()
            # Check whether this pair already exists in performers_tags
            cursor.execute("SELECT 1 FROM performers_tags WHERE performer_id = ? AND tag_id = ?", (performer_id, tag_id))
            if not cursor.fetchone():
                cursor.execute("INSERT INTO performers_tags (performer_id, tag_id) VALUES (?, ?)", (performer_id, tag_id))
                conn.commit()
                logger.info(f"Inserted performer_id {performer_id} and tag_id {tag_id} into performers_tags.")
            else:
                logger.info(f"Entry for performer_id {performer_id} and tag_id {tag_id} already exists in performers_tags.")
    except Exception as e:
        logger.error(f"Error inserting into performers_tags: {e}")

# Process the detail.json file
def process_detail_json(detail_file):
    try:
        with open(detail_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Fetch all tag IDs
        tag_ids = get_all_tag_ids()

        # Collect the performers.name values to query
        performer_names = [entry.get('person') for entry in data]

        # Query performers.id in one batch
        performer_ids = get_performers_ids(performer_names)

        for entry in data:
            person = entry.get('person')
            vixen_cnt = entry.get('vixen_cnt', 0)
            blacked_cnt = entry.get('blacked_cnt', 0)
            tushy_cnt = entry.get('tushy_cnt', 0)
            x_art_cnt = entry.get('x_art_cnt', 0)

            # Look up performer_id
            performer_id = performer_ids.get(person.lower())
            if not performer_id:
                continue  # skip this entry if no performer_id is found

            # Handle each tag (vixen, blacked, tushy, x-art)
            for tag_name, count in zip(TAG_LIST, [vixen_cnt, blacked_cnt, tushy_cnt, x_art_cnt]):
                if count > 0:
                    tag_id = tag_ids.get(tag_name)
                    if tag_id:
                        insert_performer_tag(performer_id, tag_id)
    except Exception as e:
        logger.error(f"Error processing {detail_file}: {e}")

# Main entry point
def main():
    detail_file = 'detail.json'  # input file path; replace with the actual path
    process_detail_json(detail_file)

if __name__ == "__main__":
    main()
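Note: auto_tag.py only reads a handful of fields from each detail.json entry (person plus the per-distributor counters). A minimal sketch of one entry, with made-up values, might look like:

    {
        "person": "Example Name",
        "vixen_cnt": 2,
        "blacked_cnt": 0,
        "tushy_cnt": 1,
        "x_art_cnt": 0
    }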
iafd/merge/json2csv.py (new file, 72 lines)
@@ -0,0 +1,72 @@
import json
import csv

# Read the detail_birth.json file
def read_json(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return []
    except json.JSONDecodeError:
        print(f"Failed to parse {file_path}.")
        return []

# Write the CSV file
def write_to_csv(data, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=[
            'person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender',
            'years_active', 'ethnicity', 'nationality', 'hair_colors', 'eye_color', 'height',
            'weight', 'measurements', 'tattoos', 'piercings'
        ])
        writer.writeheader()
        for entry in data:
            # Make sure performer_aka is always a list
            performer_aka = entry.get('performer_aka', [])

            # Normalize None to an empty list and wrap non-list values in a list
            if performer_aka is None:
                performer_aka = []
            elif not isinstance(performer_aka, list):
                performer_aka = [performer_aka]

            # Write one row
            writer.writerow({
                'person': entry.get('person', ''),
                'href': entry.get('href', ''),
                'performer_aka': performer_aka,
                'birthday': entry.get('birthday', ''),
                'astrology': entry.get('astrology', ''),
                'birthplace': entry.get('birthplace', ''),
                'gender': entry.get('gender', ''),
                'years_active': entry.get('years_active', ''),
                'ethnicity': entry.get('ethnicity', ''),
                'nationality': entry.get('nationality', ''),
                'hair_colors': entry.get('hair_colors', ''),
                'eye_color': entry.get('eye_color', ''),
                'height': entry.get('height', ''),
                'weight': entry.get('weight', ''),
                'measurements': entry.get('measurements', ''),
                'tattoos': entry.get('tattoos', ''),
                'piercings': entry.get('piercings', '')
            })

# Main entry point: run the conversion
def main():
    # Input JSON file path
    input_json_file = 'detail_birth.json'
    # Output CSV file path
    output_csv_file = 'detail_birth.csv'

    # Read the JSON file
    data = read_json(input_json_file)

    # Write the data to the CSV file
    write_to_csv(data, output_csv_file)

    print(f"Data saved to {output_csv_file}")

if __name__ == "__main__":
    main()
iafd/merge/url_match.py (new file, 120 lines)
@@ -0,0 +1,120 @@
import json
import logging
import cloudscraper
import time
from requests.exceptions import RequestException

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

test_flag = True

# Read stashdb.json
def read_json(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        logger.error(f"File {file_path} not found.")
        return []
    except json.JSONDecodeError:
        logger.error(f"Error decoding JSON from {file_path}.")
        return []

# Request a URL and return the final URL after redirects
def fetch_real_url_2(url, scraper):
    try:
        response = scraper.get(url, allow_redirects=True)
        if response.status_code == 200:
            return response.url  # final URL after redirects
        else:
            logger.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
            return None
    except RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

def fetch_real_url(url, scraper):
    try:
        # Request the URL with automatic redirects disabled
        response = scraper.get(url, allow_redirects=False)

        # Check for a 301/302 response and read the Location header
        if response.status_code == 302 or response.status_code == 301:
            redirect_url = response.headers.get("Location")
            if redirect_url:
                logger.info(f"Redirected to: {redirect_url}")
                return redirect_url
            else:
                logger.warning(f"Redirect response received, but no Location header found for {url}")
                return None
        else:
            logger.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
            return None
    except RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

# Process each URL
def process_urls(data, scraper):
    loop = 0
    global test_flag

    for entry in data:
        iafd_urls = entry.get('iafd_urls', [])
        real_urls = []

        for url in iafd_urls:
            if 'perfid=' in url:
                # Redirect-style link: request it and record the redirected URL
                real_url = fetch_real_url(url, scraper)
                if real_url:
                    real_urls.append(real_url)
                # During testing, only process a small batch
                loop = loop + 1
                if test_flag and loop > 10:
                    return data

            elif 'person.rme/id=' in url:
                # Non-perfid link: add it directly
                real_urls.append(url)
            else:
                # Unknown format: add it directly and log a warning
                real_urls.append(url)
                logger.warning(f"unknown url format: {url}")

        # Update the iafd_real_url field
        entry['iafd_real_url'] = real_urls

    return data

# Save the processed result to result.json
def save_to_json(data, output_file):
    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        logger.info(f"Data saved to {output_file}")
    except Exception as e:
        logger.error(f"Error saving to {output_file}: {e}")

# Main entry point
def main():
    # Input and output files
    input_file = 'stashdb.json'
    output_file = 'result.json'

    # Create a cloudscraper session
    scraper = cloudscraper.create_scraper()

    # Read the data from stashdb.json
    data = read_json(input_file)

    # Process each URL and resolve redirects
    processed_data = process_urls(data, scraper)

    # Save the result to result.json
    save_to_json(processed_data, output_file)

if __name__ == "__main__":
    main()
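Note: url_match.py only touches the iafd_urls field of each stashdb.json entry and writes the resolved links back to iafd_real_url. A minimal input entry, with a purely illustrative URL, could look like:

    {
        "iafd_urls": [
            "https://www.iafd.com/person.rme/perfid=example/example.htm"
        ]
    }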
iafd/src/config.py (new file, 87 lines)
@@ -0,0 +1,87 @@
import logging
import os
import inspect
import time
from datetime import datetime
from logging.handlers import RotatingFileHandler
from collections import defaultdict

home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'

# Log-frequency bookkeeping
log_count = defaultdict(int)        # how many times each message has been logged
last_log_time = defaultdict(float)  # timestamp of the last write for each message

class RateLimitFilter(logging.Filter):
    """
    Rate-limit filter: within a 60-second window, the same message is written
    at most LOG_LIMIT times; anything beyond that is dropped.
    """
    LOG_LIMIT = 60  # at most 60 identical messages per minute

    def filter(self, record):
        global log_count, last_log_time
        message_key = record.getMessage()  # the log message text

        # Current time and time since this message was last written
        now = time.time()
        elapsed = now - last_log_time[message_key]

        # Limit how often the same message is written
        if elapsed < 60:  # still inside the 60-second window
            log_count[message_key] += 1
            if log_count[message_key] > self.LOG_LIMIT:
                print('reach limit.')
                return False  # drop the record
        else:
            log_count[message_key] = 1  # window expired, start counting again

        last_log_time[message_key] = now

        return True  # allow the record through


def setup_logging(log_filename=None):
    if log_filename is None:
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
        current_date = datetime.now().strftime('%Y%m%d')
        log_filename = f'../log/{caller_filename}_{current_date}.log'

    max_log_size = 100 * 1024 * 1024  # 100 MB per file
    max_log_files = 10  # keep at most 10 rotated log files

    file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
    file_handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
    ))

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
    ))

    # Create the root logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.handlers = []  # avoid adding handlers twice
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    # Attach the rate limiter
    rate_limit_filter = RateLimitFilter()
    file_handler.addFilter(rate_limit_filter)
    console_handler.addFilter(rate_limit_filter)


# Example run
if __name__ == "__main__":
    setup_logging()

    for i in range(1000):
        logging.info("test log message, checking the rate limit")
        time.sleep(0.01)  # simulate rapid logging
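Note: the other scripts in this commit (fetch.py, load.py) initialize logging by importing this module and calling setup_logging() once before using the standard logging calls. A minimal sketch (the message text is illustrative):

    import logging
    import config

    config.setup_logging()           # file + console handlers with the rate-limit filter
    logging.info("fetch started")    # goes to ../log/<caller>_<date>.log and the console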
iafd/src/fetch.py (new file, 411 lines)
@@ -0,0 +1,411 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import iafd_scraper as scraper
import utils

config.setup_logging()

debug = False
force = False

# Fetch the performer list by astrology sign (no pagination)
def fetch_performers_by_astro():
    for astro in scraper.astro_list:
        url = scraper.astr_base_url + astro
        logging.info(f"Fetching data for {astro}, url {url} ...")

        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_astro(soup, astro)
            if list_data:
                for row in list_data:
                    # Write to the performer index table
                    perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_astro_list=1)
                    if perfomer_id:
                        logging.debug(f"insert performer index to db. performer_id: {perfomer_id}, name: {row['person']}, href: {row['href']}")
                    else:
                        logging.warning(f"insert performer index failed. name: {row['person']}, href: {row['href']}")

            else:
                logging.warning(f'parse astro page error. {url} ...')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
        else:
            logging.warning(f'fetch astro page error. {url} ...')

        # Break early when debugging
        if debug:
            break


# Fetch the performer list by birthday (no pagination)
def fetch_performers_by_birth():
    for month in range(1, 13):    # months 1 to 12
        for day in range(1, 32):  # days 1 to 31
            url = scraper.birth_base_url.format(month=month, day=day)
            logging.info(f"Fetching data for birth, url {url}")
            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_page_birth(soup, month, day)
                if list_data:
                    for row in list_data:
                        # Write to the performer index table
                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_birth_list=1)
                        if perfomer_id:
                            logging.debug(f"insert performer index to db. performer_id: {perfomer_id}, name: {row['person']}, href: {row['href']}")
                        else:
                            logging.warning(f"insert performer index failed. name: {row['person']}, href: {row['href']}")
                else:
                    logging.warning(f'parse birth page error. {url} ...')
            elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
            else:
                logging.warning(f'fetch birth page error. {url} ...')

            # Return early when debugging
            if debug:
                return True

# Refresh the ethnicity list
def fetch_ethic_list():
    url = scraper.ethnic_list_url
    logging.info(f"Fetching data for performer's ethnic list, url {url} ...")
    soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="ethnicity1", attr_type="id"))
    if soup:
        list_data = scraper.parse_page_ethnic_list(soup, url)
        if list_data:
            for row in list_data:
                dist_id = db_tools.insert_or_update_ethnic({'name': row['name'], 'href': row.get('href', '')})
                if dist_id:
                    logging.debug(f"insert one record into ethnic table. id: {dist_id}, name: {row['name']}, href: {row.get('href', '')}")
        else:
            logging.warning(f'parse ethnic list error. {url} ...')
    elif status_code and status_code == 404:
        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
    else:
        logging.warning(f'fetch page error. {url} ...')


# Fetch the performer list by ethnicity (with pagination)
def fetch_performers_by_ethnic():
    # Refresh the ethnicity list first
    fetch_ethic_list()

    ethnic_list = db_tools.query_ethnic_hrefs()
    for row in ethnic_list:
        url = row['href']
        ethnic = row['name']
        next_url = url

        while next_url:
            logging.info(f"Fetching data for {ethnic}, url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
                                                   parser="lxml", preprocessor=scraper.preprocess_html)
            if soup:
                list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
                if list_data:
                    for row in list_data:
                        # Write to the performer index table
                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_ethnic_list=1)
                        if perfomer_id:
                            logging.debug(f"insert performer index to db. performer_id: {perfomer_id}, name: {row['person']}, href: {row['href']}")
                        else:
                            logging.warning(f"insert performer index failed. name: {row['person']}, href: {row['href']}")
                else:
                    logging.warning(f'parse ethnic page error. {url} ...')
            elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skipping...')
                break
            else:
                logging.warning(f'fetch ethnic page error. {url} ...')

        # Return early when debugging
        if debug:
            return True

# Fetch the distributors list
def fetch_distributors_list():
    url = scraper.distributors_list_url
    logging.info(f"Fetching data for distributors list, url {url} ...")
    soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
    if soup:
        list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
        if list_data:
            for row in list_data:
                dis_url = scraper.distributors_base_url + row['href']
                dist_id = db_tools.insert_or_update_distributor({'name': row['name'], 'href': dis_url})
                if dist_id:
                    logging.debug(f"insert one record into distributors table. id: {dist_id}, name: {row['name']}, href: {dis_url}")
        else:
            logging.warning(f'parse distributors list error. {url} ...')
    elif status_code and status_code == 404:
        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
    else:
        logging.warning(f'fetch distributors list error. {url} ...')

# Fetch the studios list
def fetch_studios_list():
    url = scraper.studios_list_url
    logging.info(f"Fetching data for studios list, url {url} ...")
    soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
    if soup:
        list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
        if list_data:
            for row in list_data:
                stu_url = scraper.studios_base_url + row['href']
                stu_id = db_tools.insert_or_update_studio({'name': row['name'], 'href': stu_url})
                if stu_id:
                    logging.debug(f"insert one record into studios table. id: {stu_id}, name: {row['name']}, href: {stu_url}")
        else:
            logging.warning(f'parse studios list error. {url} ...')
    elif status_code and status_code == 404:
        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
    else:
        logging.warning(f'fetch studios list error. {url} ...')


# Refresh the movie index from the distributor lists
def fetch_movies_by_dist():
    # Refresh the distributors list first
    fetch_distributors_list()

    url_list = db_tools.query_distributor_hrefs()
    if debug:
        url_list = db_tools.query_distributor_hrefs(name='vixen.com')
    for url in url_list:
        logging.info(f"Fetching data for distributor url {url} ...")
        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
            if list_data:
                for movie in list_data:
                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_dist_list=1)
                    if tmp_id:
                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                    else:
                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
            else:
                logging.warning(f'parse_page_dist_stu error. url: {url}')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
        else:
            logging.warning(f'fetching page error. {url}')
        # Break early when debugging
        if debug:
            break

# Refresh the movie index from the studio lists
def fetch_movies_by_stu():
    # Refresh the studios list first
    fetch_studios_list()

    url_list = db_tools.query_studio_hrefs()
    if debug:
        url_list = db_tools.query_studio_hrefs(name='vixen.com')
    for url in url_list:
        logging.info(f"Fetching data for studio url {url} ...")
        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
            if list_data:
                for movie in list_data:
                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_stu_list=1)
                    if tmp_id:
                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                    else:
                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
            else:
                logging.warning(f'parse_page_dist_stu error. url: {url}')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
        else:
            logging.warning(f'fetching page error. {url}')
        # Break early when debugging
        if debug:
            break

# Update performer details, one batch
def fetch_performers_detail_once(perfomers_list):
    last_performer_id = 0
    for performer in perfomers_list:
        url = performer['href']
        person = performer['name']
        logging.info(f"Fetching data for performer ({person}), url {url} ...")
        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
        if soup:
            data = scraper.parse_page_performer(soup)
            if data:
                performer_id = db_tools.insert_or_update_performer({
                    'href': url,
                    'person': person,
                    **data
                })
                if performer_id:
                    logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
                    last_performer_id = performer_id
                else:
                    logging.warning(f'insert person: ({person}) {url} failed.')

                # Also write to a local json file
                utils.write_person_json(person, url, {
                    'href': url,
                    'person': person,
                    **data
                })
            else:
                logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
        elif status_code and status_code == 404:
            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url)
            logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skipping...')
        else:
            logging.warning(f'fetch_page error. person: ({person}), url: {url}')
        time.sleep(1)
    return last_performer_id

# Update performer details
def fetch_performers_detail():
    limit_count = 5 if debug else 100
    perfomers_list = []

    # Fetch the list of new performers
    while True:
        perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count)
        if len(perfomers_list) < 1:
            logging.info(f'all new performers fetched. ')
            break
        last_perfomer_id = fetch_performers_detail_once(perfomers_list)
        logging.info(f'insert {len(perfomers_list)} person. last performer id: {last_perfomer_id}')
        if debug:
            break

    # Fetch the list of performers that need an update
    while True:
        perfomers_list = db_tools.get_performers_needed_update(limit=limit_count)
        if len(perfomers_list) < 1:
            logging.info(f'all existed performers updated. ')
            break
        last_perfomer_id = fetch_performers_detail_once(perfomers_list)
        logging.info(f'insert {len(perfomers_list)} person. last performer id: {last_perfomer_id}')
        if debug:
            break

# Update movie details
def fetch_movies_detail():
    limit_count = 10 if debug else 100
    movies_list = []
    while True:
        movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
        if len(movies_list) < 1:
            logging.info(f'all movies fetched.')
            break
        last_movie_id = 0
        succ_count = 0
        for movie in movies_list:
            url = movie['href']
            title = movie['title']
            logging.debug(f"Fetching data for movie ({title}), url {url} ...")
            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
            if soup:
                movie_data = scraper.parse_page_movie(soup, url, title)
                if movie_data:
                    # Normalize malformed distributor/studio urls
                    if movie_data['DistributorHref']:
                        movie_data['DistributorHref'] = utils.dist_stu_href_rewrite(movie_data['DistributorHref'].lower())
                    if movie_data['StudioHref']:
                        movie_data['StudioHref'] = utils.dist_stu_href_rewrite(movie_data['StudioHref'].lower())
                    movie_id = db_tools.insert_or_update_movie(movie_data)
                    if movie_id:
                        logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
                        last_movie_id = movie_id
                        succ_count += 1
                    else:
                        logging.warning(f'insert movie {url} failed.')

                    # Also write to a local json file
                    utils.write_movie_json(url, movie_data)
                else:
                    logging.warning(f'parse_page_movie error. url: {url}')
            elif status_code and status_code == 404:
                # Mark the entry as handled
                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url)
                logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
            else:
                logging.warning(f'fetch_page error. url: {url}')
            time.sleep(1)
        logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
        # Return early when debugging
        if debug:
            return True


# Map shortcut names to functions
function_map = {
    "astro": fetch_performers_by_astro,
    "birth": fetch_performers_by_birth,
    "ethnic": fetch_performers_by_ethnic,
    "dist": fetch_movies_by_dist,
    "stu": fetch_movies_by_stu,
    "performers": fetch_performers_detail,
    "movies": fetch_movies_detail,
}

# Main entry point
def main(cmd, args_debug, args_force):
    global debug
    debug = args_debug

    global force
    force = args_force

    # Start a task log entry
    task_id = db_tools.insert_task_log()
    if task_id is None:
        logging.warning(f'insert task log error.')
        return None

    logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, cmd: {cmd}')

    # Run the requested functions
    if cmd:
        function_names = cmd.split(",")  # split the comma-separated shortcuts
        for short_name in function_names:
            func = function_map.get(short_name.strip())  # look up the function for this shortcut
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {func}')
                func()
            else:
                print(f"Warning: {short_name} is not a valid function shortcut.")
    else:  # run everything
        for name, func in function_map.items():
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {func}')
                func()
            else:
                print(f"Warning: {name} is not a valid function shortcut.")

    logging.info(f'all process completed!')
    db_tools.finalize_task_log(task_id)

# TODO:
# 1. After movies are updated, set is_full_data = 0 on the related performers and refresh them.
# 2. Cross-check the movie lists between distributors and studios.
# 3. To work around inconsistent data, manually import all performers and movies first, then use this program to fetch new entries incrementally.

if __name__ == "__main__":
    # Command-line arguments
    keys_str = ",".join(function_map.keys())

    parser = argparse.ArgumentParser(description='fetch iafd data.')
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
    args = parser.parse_args()

    main(args.cmd, args.debug, args.force)
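Note: with the argument parser above, a small debug run that only refreshes the astrology lists and the performer details would look something like this (shortcut names come from function_map; run from the script's directory):

    python fetch.py --cmd astro,performers --debug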
iafd/src/iafd_scraper.py (new file, 562 lines)
@@ -0,0 +1,562 @@
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config

# Base URLs and variable parameters
host_url = "https://www.iafd.com"

astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']

birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"

distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="

studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="

ethnic_list_url = f'{host_url}/advsearch.asp'

# Headers and scraper session
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()

# Fetch a page with CloudScraper and run a page validator; supports custom parsers and preprocessing
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            if host_url not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None

            response = scraper.get(url, headers=headers)

            # Handle HTTP status codes
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None, 404  # return 404 directly so the caller can skip

            response.raise_for_status()  # raise on other HTTP errors

            # Outdated pages are treated the same as a 404
            if "invalid or outdated page" in response.text.lower():
                logging.debug(f"invalid or outdated page: {url}")
                return None, 404  # return 404 directly so the caller can skip

            # Preprocess the HTML if a preprocessor was supplied
            html_text = preprocessor(response.text) if preprocessor else response.text

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the custom page check
                return soup, response.status_code

            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # still failing after the maximum number of retries

# Repair the HTML structure: drop stray <br> tags and patch <a> tags (needed for the ethnicity pages)
def preprocess_html(html):
    return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')

# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False

# Check that the movie table exists
def movie_validator(soup, table_id):
    return soup.find("table", id=table_id) is not None

# Parse the HTML and extract the ethnicity list
def parse_page_ethnic_list(soup, href):
    div_root = soup.find("select", id="ethnicity1")
    if not div_root:
        logging.warning(f"Warning: No 'ethnicity1' select found in {href}")
        return None

    list_data = []

    # Extract all <option> tags
    options = div_root.find_all('option')
    if options:
        # Parse the value and text of each option
        for option in options:
            href = option.get('value', None)
            text = option.text.strip()
            if href and href.lower() == 'none':
                continue
            list_data.append({
                "name": text,
                "href": host_url + href if href else ''
            })
    return list_data


# Parse the HTML and extract the astrology list
def parse_page_astro(soup, astro):
    astro_div = soup.find("div", id="astro")
    if not astro_div:
        logging.warning(f"Warning: No 'astro' div found in {astro}")
        return None, None

    flag = False
    list_cnt = 0
    list_data = []
    next_url = None

    birth_date = None
    for elem in astro_div.find_all(recursive=False):
        if elem.name == "h3" and "astroday" in elem.get("class", []):
            birth_date = elem.get_text(strip=True)
        elif elem.name == "div" and "perficon" in elem.get("class", []):
            a_tag = elem.find("a")
            if a_tag:
                href = host_url + a_tag["href"]
                name = a_tag.find("span", class_="perfname")
                if name:
                    list_data.append({
                        "astrology": astro,
                        "birth_date": birth_date,
                        "person": name.get_text(strip=True),
                        "href": href
                    })
                    flag = True
                    list_cnt = list_cnt + 1
    if flag:
        logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
        return list_data, next_url
    else:
        return None, None


# Parse the page content and collect birthday entries
def parse_page_birth(soup, month, day):
    datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
    if not datarows:
        return None, None

    flag = False
    list_cnt = 0
    list_data = []
    next_url = None
    rows = datarows[0].find_all('div', class_='col-sm-4')
    for row in rows:
        link_tag = row.find('a')
        person = link_tag.text.strip() if link_tag else ''
        href = link_tag['href'] if link_tag else ''
        href = host_url + href

        # Skip entries whose href has already been collected
        flag = True
        if any(entry['href'] == href for entry in list_data):
            continue

        # Add the entry to the list
        list_data.append({
            'month': month,
            'day': day,
            'person': person,
            'href': href
        })
        list_cnt = list_cnt + 1

    if flag:
        logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
        return list_data, next_url
    else:
        return None, None


# Parse the HTML and extract performers for an ethnicity (paginated)
def parse_page_ethnic(soup, ethnic):
    rows = soup.find_all('div', class_='row headshotrow')
    flag = False
    list_data = []
    next_url = None

    for row in rows:
        for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
            link_tag = col.find('a')
            img_tag = col.find('div', class_='pictag')
            flag = True

            if link_tag and img_tag:
                href = host_url + link_tag['href']
                person = img_tag.text.strip()

                # Store the entry
                list_data.append({
                    'ethnic': ethnic,
                    'person': person,
                    'href': href
                })
    if flag:
        logging.debug(f"get {len(list_data)} persons from this page.")

        next_page = soup.find('a', rel='next')
        if next_page:
            next_url = host_url + next_page['href']
            logging.debug(f"Found next page: {next_url}")
            return list_data, next_url
        else:
            logging.debug(f"All pages fetched for {ethnic}.")
            return list_data, None
    else:
        return None, None

# Parse the distributor/studio list page
def parse_page_dist_stu_list(soup, select_name):
    list_data = []
    next_url = None

    select_element = soup.find('select', {'name': select_name})
    if select_element:
        options = select_element.find_all('option')
        for option in options:
            value = option.get('value')  # the value attribute
            text = option.text.strip()   # the option text
            list_data.append({
                'name': text,
                'href': str(value)
            })
        return list_data, next_url
    else:
        return None, None

# Parse the HTML and extract the movie rows of a distributor/studio page
def parse_page_dist_stu(soup, table_id):
    table = soup.find("table", id=table_id)
    if not table:
        logging.warning(f"Warning: No {table_id} table found ")
        return None, None

    # Find and drop the thead; it does not need to be parsed
    thead = table.find('thead')
    if thead:
        thead.decompose()

    # Only the tbody is left now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []

    list_data = []
    next_url = None
    for row in rows:
        cols = row.find_all('td')
        if len(cols) >= 5:
            title = cols[0].text.strip()
            label = cols[1].text.strip()
            year = cols[2].text.strip()
            rev = cols[3].text.strip()
            a_href = cols[0].find('a')
            href = host_url + a_href['href'] if a_href else ''

            list_data.append({
                'title': title,
                'label': label,
                'year': year,
                'rev': rev,
                'href': href
            })
    return list_data, next_url


# Parse a credits table (both the personal and the directorial one)
def parse_credits_table(table, distributor_list):
    # Find and drop the thead; it does not need to be parsed
    thead = table.find('thead')
    if thead:
        thead.decompose()

    # Only the tbody is left now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []

    movies = []
    distributor_count = {key: 0 for key in distributor_list}  # initialize the per-distributor counters

    for row in rows:
        tr_class = ' '.join(row.get('class', []))  # class attribute, or an empty string if missing
        cols = row.find_all('td')
        if len(cols) >= 6:
            title = cols[0].text.strip()
            href_a = cols[0].find('a')
            href = href_a['href'] if href_a else ''
            year = cols[1].text.strip()
            distributor = cols[2].text.strip().lower()
            href_d = cols[2].find('a')
            href_dist = host_url + href_d['href'] if href_d else ''
            notes = cols[3].text.strip()
            rev = cols[4].text.strip()
            formats = cols[5].text.strip()

            for key in distributor_list:
                if key in distributor:
                    distributor_count[key] += 1

            movies.append({
                'title': title,
                'href': href,
                'year': year,
                'distributor': distributor,
                'distributor_href': href_dist,
                'notes': notes,
                'rev': rev,
                'formats': formats,
                'tr_class': tr_class
            })
    return movies, distributor_count


# Parse a performer page and extract the data we need
def parse_page_performer(soup):
    # Extracted data
    data = {}

    # The fields we need and their labels in the HTML
    fields = {
        'performer_aka': 'Performer AKA',
        'birthday': 'Birthday',
        'astrology': 'Astrology',
        'birthplace': 'Birthplace',
        'gender': 'Gender',
        'years_active': 'Years Active',
        'ethnicity': 'Ethnicity',
        'nationality': 'Nationality',
        'hair_colors': 'Hair Colors',
        'eye_color': 'Eye Color',
        'height': 'Height',
        'weight': 'Weight',
        'measurements': 'Measurements',
        'tattoos': 'Tattoos',
        'piercings': 'Piercings'
    }
    reversed_map = {v: k for k, v in fields.items()}

    # Parse the credits tables: performer credits and directorial credits
    role_list = ['personal', 'directoral']
    distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
    credits_list = {}

    # Per-distributor counters
    distributor_count = {key: 0 for key in distributor_list}
    for role in role_list:
        table = soup.find('table', id=role)
        if table:
            movies, stat_map = parse_credits_table(table, distributor_list)
            credits_list[role] = movies
            # Update the distributor counters
            for distributor in distributor_list:
                distributor_count[distributor] += stat_map.get(distributor, 0)

    # Count the movies
    movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))

    # Nothing found
    if len(credits_list) == 0:
        logging.warning("movie table empty.")

    # Walk every bioheading to collect the metadata
    bioheadings = soup.find_all('p', class_='bioheading')
    for bio in bioheadings:
        heading = bio.text.strip()
        biodata = None

        # Headings containing "Performer" need special handling
        if 'Performer' in heading:
            heading = 'Performer AKA'
            biodata_div = bio.find_next('div', class_='biodata')
            if biodata_div:
                div_text = biodata_div.get_text(separator='|').strip()
                biodata = [b.strip() for b in div_text.split('|') if b.strip()]
        else:
            biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''

        # Store the value
        if heading in reversed_map:
            kkey = reversed_map[heading]
            data[kkey] = biodata

    # Add the statistics to data
    data['movies_cnt'] = movies_cnt
    data['vixen_cnt'] = distributor_count['vixen']
    data['blacked_cnt'] = distributor_count['blacked']
    data['tushy_cnt'] = distributor_count['tushy']
    data['x_art_cnt'] = distributor_count['x-art']
    data['credits'] = credits_list

    return data


# Parse a movie page and extract the movie information
def parse_page_movie(soup, href, title):
    # Parse the basic movie info
    movie_data = {}
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            val = value.text.strip()
            if key in ["Distributor", "Studio", "Director"]:
                link = value.find("a")
                if link:
                    val = link.text.strip()
                    movie_data[f'{key}Href'] = host_url + link['href']
            movie_data[key] = val
    else:
        return None

    # Parse the cast
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]

            performer["tags"] = [
                tag.strip() for br in cast.find_all("br")
                if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
            ]

            performers.append(performer)

    # Parse the scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")

        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()  # scene number
                performer_info = cols[1]      # performers and their links

                # Take the full HTML before the first <br> (keeps <i> tags and other formatting)
                performer_html = str(performer_info)        # the whole cell as HTML
                split_html = performer_html.split("<br/>")  # split on <br/>
                if split_html:
                    performers_html = split_html[0].strip()  # part before the <br/>
                else:
                    split_html = performer_html.split("<br>")  # split on <br>
                    if split_html:
                        performers_html = split_html[0].strip()  # part before the <br>
                    else:
                        performers_html = performer_html.strip()  # no <br>, take everything

                # Strip the HTML tags and keep only the text
                performers_soup = BeautifulSoup(performers_html, "html.parser")
                performers_text = performers_soup.get_text()

                # Extract the performers
                scene_performers = [p.strip() for p in performers_text.split(",")]

                # Try to extract `webscene` and `studio`
                links_data = {}
                links = performer_info.find_all("a")
                if links:
                    webscene_title = links[0].text.strip() if len(links) > 0 else None
                    webscene = links[0]["href"] if len(links) > 0 else None
                    studio = links[1].text.strip() if len(links) > 1 else None
                    studio_lnk = links[1]["href"] if len(links) > 1 else None
                    links_data = {
                        "title": webscene_title,
                        "webscene": webscene,
                        "studio": studio,
                        "studio_lnk": studio_lnk,
                    }

                scene_data = {
                    "scene": scene,
                    "performers": scene_performers,
                    **links_data,
                }
                scene_breakdowns.append(scene_data)

    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})


    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": movie_data.get("DirectorHref", ""),
        "DistributorHref": movie_data.get("DistributorHref", ""),
        "StudioHref": movie_data.get("StudioHref", ""),
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }


if __name__ == "__main__":

    for astro in astro_list:
        url = astr_base_url + astro
        next_url = url
        logging.info(f"Fetching data for {astro}, url {url} ...")

        while True:
            soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
            if soup:
                list_data, next_url = parse_page_astro(soup, astro)
                if list_data:
                    print(list_data[0] if len(list_data) > 0 else 'no data')
                break
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying

        time.sleep(2)  # throttle the request rate
iafd/src/load.py (new file, 107 lines)
@@ -0,0 +1,107 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import iafd_scraper as scraper
import utils

config.setup_logging()

res_dir = '/root/hostdir/scripts_data/iafd_202503'

# Performer list
def load_performer_list(file, **from_fields):
    json_data = utils.read_json(file)
    if json_data is None:
        json_data = []

    total_rows = len(json_data)
    loaded_rows = 0
    succ = 0
    for row in json_data:
        row_id = db_tools.insert_performer_index(name=row.get('person', ''),
                                                 href=row.get('href', ''),
                                                 **from_fields
                                                 )
        if row_id:
            logging.debug(f"insert one person, id: {row_id}, person: {row['person']}, url: {row['href']}")
            succ += 1
        else:
            logging.warning(f"insert person failed. {row['person']}, {row['href']} failed.")
        loaded_rows += 1
        if loaded_rows % 10000 == 0:
            logging.info(f'loading file: {file}, total rows: {total_rows}, loaded rows: {loaded_rows}, succ rows: {succ}')

    logging.info(f'load data succ. file: {file}, rows: {total_rows}, succ rows: {succ}')

# Movie list
def load_movie_list(file, **from_fields):
    json_data = utils.read_json(file)
    if json_data is None:
        json_data = []

    total_rows = len(json_data)
    loaded_rows = 0
    succ = 0
    for row in json_data:
        row_id = db_tools.insert_movie_index(title=row.get('title', ''),
                                             href=row.get('href', ''),
                                             release_year=utils.to_number(row['year']),
                                             **from_fields
                                             )
        if row_id:
            logging.debug(f"insert one movie, id: {row_id}, title: {row['title']}, url: {row['href']}")
            succ += 1
        else:
            logging.warning(f"insert movie failed: {row['title']}, {row['href']} failed.")
        loaded_rows += 1
        if loaded_rows % 10000 == 0:
            logging.info(f'loading file: {file}, total rows: {total_rows}, loaded rows: {loaded_rows}, succ rows: {succ}')

    logging.info(f'load data succ. file: {file}, rows: {len(json_data)}, succ rows: {succ}')


# Performer details
def load_performers(file):
    json_data = utils.read_json(file)
    if json_data is None:
        json_data = []

    total_rows = len(json_data)
    loaded_rows = 0
    succ = 0
    for row in json_data:
        performer_id = db_tools.insert_or_update_performer(row)
        if performer_id:
            logging.debug(f"insert one person, id: {performer_id}, person: {row['person']}, url: {row['href']}")
            succ += 1
        else:
            logging.warning(f"insert person failed. {row['person']}, {row['href']} failed.")
        loaded_rows += 1
        if loaded_rows % 10000 == 0:
            logging.info(f'loading file: {file}, total rows: {total_rows}, loaded rows: {loaded_rows}, succ rows: {succ}')

    logging.info(f'load data succ. file: {file}, rows: {len(json_data)}, succ rows: {succ}')


if __name__ == "__main__":

    load_performer_list(f'{res_dir}/astro.json', from_astro_list=1)
    time.sleep(3)
    load_performer_list(f'{res_dir}/birth.json', from_birth_list=1)
    time.sleep(3)
    load_performer_list(f'{res_dir}/ethnic.json', from_ethnic_list=1)
    time.sleep(3)

    load_movie_list(f'{res_dir}/distributors.json', from_dist_list=1)
    time.sleep(3)
    load_movie_list(f'{res_dir}/studios.json', from_stu_list=1)
    time.sleep(3)

    load_performers(f'{res_dir}/performers.json')
848
iafd/src/sqlite_utils.py
Normal file
848
iafd/src/sqlite_utils.py
Normal file
@ -0,0 +1,848 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import config
|
||||
import utils
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
# 连接 SQLite 数据库
|
||||
DB_PATH = f"{config.global_share_data_dir}/shared.db" # 替换为你的数据库文件
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# 获取当前时间
|
||||
def get_current_time():
|
||||
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
|
||||
# """从指定表中通过 href 查找 id"""
|
||||
def get_id_by_href(table: str, href: str) -> int:
|
||||
if href is None:
|
||||
return None
|
||||
cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
|
||||
row = cursor.fetchone()
|
||||
return row[0] if row else None
|
||||
|
||||
# 插入演员索引,来自于列表数据
|
||||
def insert_performer_index(name, href, from_astro_list=None, from_birth_list=None, from_ethnic_list=None, from_movie_list=None):
|
||||
try:
|
||||
# **查询是否已存在该演员**
|
||||
cursor.execute("""
|
||||
SELECT id, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list
|
||||
FROM iafd_performers WHERE href = ?
|
||||
""", (href,))
|
||||
existing_performer = cursor.fetchone()
|
||||
|
||||
if existing_performer: # **如果演员已存在**
|
||||
performer_id, existing_name, existing_astro, existing_birth, existing_ethnic, existing_movie = existing_performer
|
||||
|
||||
# **如果没有传入值,则保持原有值**
|
||||
from_astro_list = from_astro_list if from_astro_list is not None else existing_astro
|
||||
from_birth_list = from_birth_list if from_birth_list is not None else existing_birth
|
||||
from_ethnic_list = from_ethnic_list if from_ethnic_list is not None else existing_ethnic
|
||||
from_movie_list = from_movie_list if from_movie_list is not None else existing_movie
|
||||
|
||||
cursor.execute("""
|
||||
UPDATE iafd_performers
|
||||
SET name = ?,
|
||||
from_astro_list = ?,
|
||||
from_birth_list = ?,
|
||||
from_ethnic_list = ?,
|
||||
from_movie_list = ?,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
WHERE href = ?
|
||||
""", (name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list, href))
|
||||
else: # **如果演员不存在,插入**
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_performers (href, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list)
|
||||
VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
|
||||
""", (href, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list))
|
||||
|
||||
conn.commit()
|
||||
|
||||
performer_id = get_id_by_href('iafd_performers', href)
|
||||
if performer_id:
|
||||
logging.debug(f'Inserted/Updated performer index, id: {performer_id}, name: {name}, href: {href}')
|
||||
|
||||
return performer_id
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error(f"未知错误: {e}")
|
||||
return None
|
||||
|
||||
# """插入电影索引,来自于列表数据"""
|
||||
def insert_movie_index(title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
|
||||
try:
|
||||
# **查询是否已存在该电影**
|
||||
cursor.execute("""
|
||||
SELECT id, title, release_year, from_performer_list, from_dist_list, from_stu_list
|
||||
FROM iafd_movies WHERE href = ?
|
||||
""", (href,))
|
||||
existing_movie = cursor.fetchone()
|
||||
|
||||
if existing_movie: # **如果电影已存在**
|
||||
movie_id, existing_title, existing_year, existing_performer, existing_dist, existing_stu = existing_movie
|
||||
|
||||
# **如果没有传入值,则保持原有值**
|
||||
release_year = release_year if release_year != 0 else existing_year
|
||||
from_performer_list = from_performer_list if from_performer_list is not None else existing_performer
|
||||
from_dist_list = from_dist_list if from_dist_list is not None else existing_dist
|
||||
from_stu_list = from_stu_list if from_stu_list is not None else existing_stu
|
||||
|
||||
cursor.execute("""
|
||||
UPDATE iafd_movies
|
||||
SET title = ?,
|
||||
release_year = ?,
|
||||
from_performer_list = ?,
|
||||
from_dist_list = ?,
|
||||
from_stu_list = ?,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
WHERE href = ?
|
||||
""", (title, release_year, from_performer_list, from_dist_list, from_stu_list, href))
|
||||
else: # **如果电影不存在,插入**
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_movies (title, href, release_year, from_performer_list, from_dist_list, from_stu_list)
|
||||
VALUES (?, ?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
|
||||
""", (title, href, release_year, from_performer_list, from_dist_list, from_stu_list))
|
||||
|
||||
conn.commit()
|
||||
|
||||
movie_id = get_id_by_href('iafd_movies', href)
|
||||
if movie_id:
|
||||
logging.debug(f'Inserted/Updated movie index, id: {movie_id}, title: {title}, href: {href}')
|
||||
|
||||
return movie_id
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error(f"未知错误: {e}")
|
||||
return None
|
||||
|
||||
# 插入演员和电影的关联数据
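# Note: the ON CONFLICT(movie_id, performer_id) upsert below assumes iafd_performers_movies
# has a UNIQUE constraint (or primary key) on (movie_id, performer_id); SQLite rejects an
# ON CONFLICT target that does not match such a constraint.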
|
||||
def insert_performer_movie(performer_id, movie_id, role, notes):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_performers_movies (performer_id, movie_id, role, notes)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes, role=excluded.role
|
||||
""",
|
||||
(performer_id, movie_id, role, notes)
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
#logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}')
|
||||
|
||||
return performer_id
|
||||
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error("Error inserting movie: %s", e)
|
||||
return None
|
||||
|
||||
# 插入电影和电影的关联数据
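# Note: as above, the ON CONFLICT(movie_id, appears_in_id) upsert assumes a matching
# UNIQUE constraint on iafd_movies_appers_in.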
|
||||
def insert_movie_appears_in(movie_id, appears_in_id, gradation=0, notes=''):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_movies_appers_in (movie_id, appears_in_id, gradation, notes)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT(movie_id, appears_in_id) DO UPDATE SET notes=excluded.notes, gradation=excluded.gradation
|
||||
""",
|
||||
(movie_id, appears_in_id, gradation, notes)
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
#logging.debug(f'insert one movie_appears_in, movie_id: {movie_id}, appears_in_id: {appears_in_id}')
|
||||
|
||||
return movie_id
|
||||
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error("Error inserting movie: %s", e)
|
||||
return None
|
||||
|
||||
|
||||
# 插入演员信息
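# The ON CONFLICT(href) upsert below relies on iafd_performers.href being declared UNIQUE.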
|
||||
def insert_or_update_performer(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_performers (href, name, gender, birthday, astrology, birthplace, years_active, ethnicity, nationality, hair_colors,
|
||||
eye_color, height_str, weight_str, measurements, tattoos, piercings, weight, height, movies_cnt, vixen_cnt,
|
||||
blacked_cnt, tushy_cnt, x_art_cnt, is_full_data, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
gender = excluded.gender,
|
||||
birthday = excluded.birthday,
|
||||
astrology = excluded.astrology,
|
||||
birthplace = excluded.birthplace,
|
||||
years_active = excluded.years_active,
|
||||
ethnicity = excluded.ethnicity,
|
||||
nationality = excluded.nationality,
|
||||
hair_colors = excluded.hair_colors,
|
||||
eye_color = excluded.eye_color,
|
||||
height_str = excluded.height_str,
|
||||
weight_str = excluded.weight_str,
|
||||
measurements = excluded.measurements,
|
||||
tattoos = excluded.tattoos,
|
||||
piercings = excluded.piercings,
|
||||
weight = excluded.weight,
|
||||
height = excluded.height,
|
||||
movies_cnt = excluded.movies_cnt,
|
||||
vixen_cnt = excluded.vixen_cnt,
|
||||
blacked_cnt = excluded.blacked_cnt,
|
||||
tushy_cnt = excluded.tushy_cnt,
|
||||
x_art_cnt = excluded.x_art_cnt,
|
||||
is_full_data = 1,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (
|
||||
data["href"], data["person"], data.get("gender"), data.get("birthday"), data.get("astrology"), data.get("birthplace"), data.get("years_active"),
|
||||
data.get("ethnicity"), data.get("nationality"), data.get("hair_colors"), data.get("eye_color"), data.get("height"),
|
||||
data.get("weight"), data.get("measurements"), data.get("tattoos"), data.get("piercings"), utils.parse_weight(data.get('weight')), utils.parse_height(data.get('height')),
|
||||
data.get("movies_cnt", 0), data.get("vixen_cnt", 0), data.get("blacked_cnt", 0), data.get("tushy_cnt", 0), data.get("x_art_cnt", 0)
|
||||
))
|
||||
|
||||
# 获取 performer_id
|
||||
performer_id = get_id_by_href('iafd_performers', data["href"])
|
||||
if performer_id is None:
|
||||
return None
|
||||
logging.debug(f"insert one performer, id: {performer_id}, name: {data['person']}, href: {data['href']}")
|
||||
|
||||
# 插入新的 alias
|
||||
for alias in data.get("performer_aka") or []:
|
||||
if alias.lower() != "no known aliases":
|
||||
cursor.execute("INSERT OR IGNORE INTO iafd_performer_aliases (performer_id, alias) VALUES (?, ?) ", (performer_id, alias))
|
||||
|
||||
conn.commit()
|
||||
|
||||
# 插入影片列表,可能有 personal 和 director 两个身份
|
||||
credits = data.get('credits', {})
|
||||
for role, movies in credits.items():
|
||||
if movies:
|
||||
for movie in movies:
|
||||
movie_id = get_id_by_href('iafd_movies', movie['href'])
|
||||
# 影片不存在,先插入
|
||||
if movie_id is None:
|
||||
movie_id = insert_movie_index(movie['title'], movie['href'], utils.to_number(movie['year']), from_performer_list=1)
|
||||
if movie_id:
|
||||
tmp_id = insert_performer_movie(performer_id, movie_id, role, movie['notes'])
|
||||
if tmp_id :
|
||||
logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}, role: {role}')
|
||||
else:
|
||||
logging.warning(f"insert performer_movie failed. performer_id: {performer_id}, movie href: {movie['href']}")
|
||||
|
||||
return performer_id
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error(f"未知错误: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# """插入或更新电影数据(异常url的处理,比如404链接)"""
|
||||
def insert_or_update_performer_404(name, href):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_performers (href, name, is_full_data, updated_at)
|
||||
VALUES (?, ?, 1, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
is_full_data = 1,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (
|
||||
href, name
|
||||
))
|
||||
|
||||
# 获取 performer_id
|
||||
performer_id = get_id_by_href('iafd_performers', href)
|
||||
if performer_id is None:
|
||||
return None
|
||||
logging.debug(f'insert one performer, id: {performer_id}, name: {name}, href: {href}')
|
||||
|
||||
return performer_id
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error(f"未知错误: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# 按 id 或 href 删除演员
|
||||
def delete_performer(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("DELETE FROM iafd_performers WHERE id = ?", (identifier,))
|
||||
elif isinstance(identifier, str):
|
||||
cursor.execute("DELETE FROM iafd_performers WHERE href = ?", (identifier,))
|
||||
else:
|
||||
logging.warning("无效的删除参数")
|
||||
return
|
||||
conn.commit()
|
||||
logging.info(f"成功删除演员: {identifier}")
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"删除失败: {e}")
|
||||
|
||||
# 按 id、href 或 name 查询演员信息
|
||||
def query_performer(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("SELECT * FROM iafd_performers WHERE id = ?", (identifier,))
|
||||
elif "http" in identifier:
|
||||
cursor.execute("SELECT * FROM iafd_performers WHERE href = ?", (identifier,))
|
||||
else:
|
||||
cursor.execute("SELECT * FROM iafd_performers WHERE name LIKE ?", (f"%{identifier}%",))
|
||||
|
||||
performer = cursor.fetchone()
if performer:
    # Build the performer dict first, while cursor.description still describes the iafd_performers row.
    result = dict(zip([desc[0] for desc in cursor.description], performer))
    cursor.execute("SELECT alias FROM iafd_performer_aliases WHERE performer_id = ?", (performer[0],))
    result["performer_aka"] = [row[0] for row in cursor.fetchall()]
    return result
|
||||
else:
|
||||
logging.warning(f"未找到演员: {identifier}")
|
||||
return None
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询失败: {e}")
|
||||
return None
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def query_performer_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href, name FROM iafd_performers WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
if "is_full_data" in filters:
|
||||
sql += " AND is_full_data = ?"
|
||||
params.append(filters["is_full_data"])
|
||||
if 'limit' in filters:
|
||||
sql += " limit ?"
|
||||
params.append(filters["limit"])
|
||||
|
||||
|
||||
cursor.execute(sql, params)
|
||||
#return [row[0].lower() for row in cursor.fetchall()] # 返回小写
|
||||
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# Insert or update ethnicity metadata (iafd_meta_ethnic)
|
||||
def insert_or_update_ethnic(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_meta_ethnic (name, href)
|
||||
VALUES (?, ?)
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name
|
||||
""", (data["name"], data["href"]))
|
||||
conn.commit()
|
||||
|
||||
# Fetch the id of the ethnicity row just inserted/updated
|
||||
cursor.execute("SELECT id FROM iafd_meta_ethnic WHERE href = ?", (data["href"],))
|
||||
dist_id = cursor.fetchone()[0]
|
||||
if dist_id:
|
||||
logging.debug(f"成功插入/更新ethnic: {data['name']}")
|
||||
return dist_id
|
||||
else:
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def query_ethnic_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href, name FROM iafd_meta_ethnic WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "url" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
#return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# Insert or update a distributor
|
||||
def insert_or_update_distributor(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_distributors (name, href, updated_at)
|
||||
VALUES (?, ? , datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (data["name"], data["href"]))
|
||||
conn.commit()
|
||||
|
||||
# Fetch the distributor id just inserted/updated
|
||||
cursor.execute("SELECT id FROM iafd_distributors WHERE href = ?", (data["href"],))
|
||||
dist_id = cursor.fetchone()[0]
|
||||
if dist_id:
|
||||
logging.debug(f"成功插入/更新发行商: {data['name']}")
|
||||
return dist_id
|
||||
else:
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
|
||||
# Delete a distributor (by id or name)
|
||||
def delete_distributor(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("DELETE FROM iafd_distributors WHERE id = ?", (identifier,))
|
||||
elif isinstance(identifier, str):
|
||||
cursor.execute("DELETE FROM iafd_distributors WHERE name = ?", (identifier,))
|
||||
conn.commit()
|
||||
logging.info(f"成功删除发行商: {identifier}")
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"删除失败: {e}")
|
||||
|
||||
# Query a distributor (by id or name)
|
||||
def query_distributor(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("SELECT * FROM iafd_distributors WHERE id = ?", (identifier,))
|
||||
else:
|
||||
cursor.execute("SELECT * FROM iafd_distributors WHERE name LIKE ?", (f"%{identifier}%",))
|
||||
|
||||
distributor = cursor.fetchone()
|
||||
if distributor:
|
||||
return dict(zip([desc[0] for desc in cursor.description], distributor))
|
||||
else:
|
||||
logging.warning(f"未找到发行商: {identifier}")
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询失败: {e}")
|
||||
return None
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def query_distributor_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href FROM iafd_distributors WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "url" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# """ 插入或更新制作公司 """
|
||||
def insert_or_update_studio(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_studios (name, href, updated_at)
|
||||
VALUES (?, ?, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (data["name"], data["href"]))
|
||||
conn.commit()
|
||||
|
||||
# Fetch the studio id just inserted/updated
|
||||
cursor.execute("SELECT id FROM iafd_studios WHERE href = ?", (data["href"],))
|
||||
stu_id = cursor.fetchone()[0]
|
||||
if stu_id:
|
||||
logging.debug(f"成功插入/更新发行商: {data['name']}")
|
||||
return stu_id
|
||||
else:
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
|
||||
# """ 删除制作公司(按 id 或 name) """
|
||||
def delete_studio(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("DELETE FROM iafd_studios WHERE id = ?", (identifier,))
|
||||
elif isinstance(identifier, str):
|
||||
cursor.execute("DELETE FROM iafd_studios WHERE name = ?", (identifier,))
|
||||
conn.commit()
|
||||
logging.info(f"成功删除制作公司: {identifier}")
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"删除失败: {e}")
|
||||
|
||||
# """ 查询制作公司(按 id 或 name) """
|
||||
def query_studio(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("SELECT * FROM iafd_studios WHERE id = ?", (identifier,))
|
||||
else:
|
||||
cursor.execute("SELECT * FROM iafd_studios WHERE name LIKE ?", (f"%{identifier}%",))
|
||||
|
||||
studio = cursor.fetchone()
|
||||
if studio:
|
||||
return dict(zip([desc[0] for desc in cursor.description], studio))
|
||||
else:
|
||||
logging.warning(f"未找到制作公司: {identifier}")
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询失败: {e}")
|
||||
return None
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def query_studio_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href FROM iafd_studios WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# """插入或更新电影数据"""
|
||||
def insert_or_update_movie(movie_data):
|
||||
try:
|
||||
# 获取相关 ID
|
||||
distributor_id = get_id_by_href('iafd_distributors', movie_data['DistributorHref'])
|
||||
studio_id = get_id_by_href('iafd_studios', movie_data['StudioHref'])
|
||||
director_id = get_id_by_href('iafd_performers', movie_data['DirectorHref'])
|
||||
# 导演不存在的话,插入一条
|
||||
if director_id is None:
|
||||
director_id = insert_performer_index( movie_data['Director'], movie_data['DirectorHref'], from_movie_list=1)
|
||||
if studio_id is None:
|
||||
studio_id = 0
|
||||
if distributor_id is None:
|
||||
distributor_id = 0
|
||||
|
||||
# 插入或更新电影信息
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO iafd_movies (title, minutes, distributor_id, studio_id, release_date, added_to_IAFD_date,
|
||||
all_girl, all_male, compilation, webscene, director_id, href, is_full_data, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
title=excluded.title, minutes=excluded.minutes, distributor_id=excluded.distributor_id,
|
||||
studio_id=excluded.studio_id, release_date=excluded.release_date,
|
||||
added_to_IAFD_date=excluded.added_to_IAFD_date, all_girl=excluded.all_girl,
|
||||
all_male=excluded.all_male, compilation=excluded.compilation, webscene=excluded.webscene,
|
||||
director_id=excluded.director_id, is_full_data=1, updated_at = datetime('now', 'localtime')
|
||||
""",
|
||||
(movie_data['title'], movie_data['Minutes'], distributor_id, studio_id, movie_data['ReleaseDate'],
|
||||
movie_data['AddedtoIAFDDate'], movie_data['All-Girl'], movie_data['All-Male'],
|
||||
movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
# 获取插入的 movie_id
|
||||
movie_id = get_id_by_href('iafd_movies', movie_data['href'])
|
||||
if movie_id is None:
|
||||
return None
|
||||
|
||||
logging.debug(f"insert one movie, id: {movie_id}, title: {movie_data['title']}, href: {movie_data['href']}")
|
||||
|
||||
# 插入 performers_movies 关系表
|
||||
for performer in movie_data.get('Performers', []):
|
||||
performer_id = get_id_by_href('iafd_performers', performer['href'])
|
||||
# 如果演员不存在,先插入
|
||||
if performer_id is None:
|
||||
performer_id = insert_performer_index(performer['name'], performer['href'], from_movie_list=1)
|
||||
if performer_id:
|
||||
notes = '|'.join(tag for tag in performer['tags'] if tag != performer['name'])
|
||||
tmp_id = insert_performer_movie(performer_id, movie_id, 'personal', notes)
|
||||
if tmp_id:
|
||||
logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}")
|
||||
else:
|
||||
logging.warning(f"insert performer_movie failed. performer_id: {performer_id}, movie_id: {movie_id}")
|
||||
else:
|
||||
logging.warning(f"insert performer failed. name: {performer['name']}, href: {performer['href']}")
|
||||
|
||||
# 插入 movies_appers_in 表
|
||||
for appears in movie_data.get("AppearsIn", []):
|
||||
appears_in_id = get_id_by_href('iafd_movies', appears['href'])
|
||||
# 不存在,先插入
|
||||
if appears_in_id is None:
|
||||
appears_in_id = insert_movie_index( appears['title'], appears['href'])
|
||||
if appears_in_id:
|
||||
tmp_id = insert_movie_appears_in(movie_id, appears_in_id)
|
||||
if tmp_id:
|
||||
logging.debug(f'insert one movie_appears_in record. movie_id: {movie_id}, appears_in_id: {appears_in_id}')
|
||||
else:
|
||||
logging.warning(f'insert movie_appears_in failed. movie_id: {movie_id}, appears_in_id: {appears_in_id}')
|
||||
else:
|
||||
logging.warning(f"get appears_in_id failed. title: {appears['title']}, href: {appears['href']}")
|
||||
|
||||
return movie_id
|
||||
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error("Error inserting movie: %s", e)
|
||||
return None
|
||||
|
||||
|
||||
# """插入或更新电影数据(异常url的处理,比如404链接)"""
|
||||
def insert_or_update_movie_404(title, href):
|
||||
try:
|
||||
# 插入或更新电影信息
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO iafd_movies (title, href, is_full_data, updated_at)
|
||||
VALUES (?, ?, 1, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
title=excluded.title, is_full_data=1, updated_at = datetime('now', 'localtime')
|
||||
""",
|
||||
(title, href)
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
# 获取插入的 movie_id
|
||||
movie_id = get_id_by_href('iafd_movies', href)
|
||||
if movie_id is None:
|
||||
return None
|
||||
|
||||
return movie_id
|
||||
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error("Error inserting movie: %s", e)
|
||||
return None
|
||||
|
||||
|
||||
# 删除电影数据"""
|
||||
def delete_movie(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("DELETE FROM iafd_movies WHERE id = ?", (identifier,))
|
||||
elif isinstance(identifier, str):
|
||||
cursor.execute("DELETE FROM iafd_movies WHERE href = ?", (identifier,))
|
||||
else:
|
||||
logging.warning("无效的删除参数")
|
||||
return
|
||||
conn.commit()
|
||||
logging.info(f"Deleted movie with {identifier}")
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error("Error deleting movie: %s", e)
|
||||
|
||||
# 查找电影数据"""
|
||||
def query_movies(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("SELECT * FROM iafd_movies WHERE id = ?", (identifier,))
|
||||
elif "http" in identifier:
|
||||
cursor.execute("SELECT * FROM iafd_movies WHERE href = ?", (identifier,))
|
||||
else:
|
||||
cursor.execute("SELECT * FROM iafd_movies WHERE title LIKE ?", (f"%{identifier}%",))
|
||||
|
||||
movie = cursor.fetchone()
if movie:
    # Build the movie dict first, while cursor.description still describes the iafd_movies row.
    result = dict(zip([desc[0] for desc in cursor.description], movie))
    # The junction table is keyed by movie_id here, not performer_id.
    cursor.execute("SELECT performer_id FROM iafd_performers_movies WHERE movie_id = ?", (movie[0],))
    result["performers"] = [row[0] for row in cursor.fetchall()]
    return result
|
||||
else:
|
||||
logging.warning(f"find no data: {identifier}")
|
||||
return None
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询失败: {e}")
|
||||
return None
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def query_movie_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href, title FROM iafd_movies WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "title" in filters:
|
||||
sql += " AND title LIKE ?"
|
||||
params.append(f"%{filters['title']}%")
|
||||
if "is_full_data" in filters:
|
||||
sql += " AND is_full_data = ?"
|
||||
params.append(filters["is_full_data"])
|
||||
if 'limit' in filters:
|
||||
sql += " limit ?"
|
||||
params.append(filters["limit"])
|
||||
|
||||
cursor.execute(sql, params)
|
||||
#return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return []
|
||||
|
||||
# 获取 view_iafd_performers_movies 中数据 不匹配的演员信息。
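# Note: the optional limit is formatted directly into the SQL text below, so it must be a
# trusted integer (it is not passed as a bound parameter).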
|
||||
def get_performers_needed_update(limit=None):
|
||||
try:
|
||||
sql = """
|
||||
SELECT href, name FROM view_iafd_performers_movies where actual_movies_cnt != movies_cnt
|
||||
"""
|
||||
|
||||
if limit is not None:
|
||||
sql += f" LIMIT {limit}"
|
||||
|
||||
cursor.execute(sql)
|
||||
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return []
|
||||
|
||||
# 插入一条任务日志
|
||||
def insert_task_log():
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_task_log (task_status) VALUES ('Start')
|
||||
""")
|
||||
conn.commit()
|
||||
|
||||
task_id = cursor.lastrowid
|
||||
if task_id is None:
|
||||
return None
|
||||
update_task_log(task_id=task_id, task_status='Start')
|
||||
|
||||
return task_id # 获取插入的 task_id
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"插入任务失败: {e}")
|
||||
return None
|
||||
|
||||
# 更新任务日志的字段
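# Caution: the column names in this UPDATE come straight from the kwargs keys and are
# interpolated into the SQL string; only the values are bound as parameters, so callers
# must pass trusted iafd_task_log column names.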
|
||||
def update_task_log_inner(task_id, **kwargs):
|
||||
try:
|
||||
fields = ", ".join(f"{key} = ?" for key in kwargs.keys())
|
||||
params = list(kwargs.values()) + [task_id]
|
||||
|
||||
sql = f"UPDATE iafd_task_log SET {fields}, updated_at = datetime('now', 'localtime') WHERE task_id = ?"
|
||||
cursor.execute(sql, params)
|
||||
conn.commit()
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"更新任务 {task_id} 失败: {e}")
|
||||
|
||||
# 更新任务日志的字段
|
||||
def update_task_log(task_id, task_status):
|
||||
try:
|
||||
# 获取 performers、studios 等表的最终行数
|
||||
cursor.execute("SELECT COUNT(*) FROM iafd_performers where is_full_data=1")
|
||||
full_data_performers = cursor.fetchone()[0]
|
||||
cursor.execute("SELECT COUNT(*) FROM iafd_performers")
|
||||
total_performers = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM iafd_movies where is_full_data=1")
|
||||
full_data_movies = cursor.fetchone()[0]
|
||||
cursor.execute("SELECT COUNT(*) FROM iafd_movies")
|
||||
total_movies = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM iafd_distributors")
|
||||
total_distributors = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM iafd_studios")
|
||||
total_studios = cursor.fetchone()[0]
|
||||
|
||||
# 更新 task_log
|
||||
update_task_log_inner(task_id,
|
||||
full_data_performers=full_data_performers,
|
||||
total_performers=total_performers,
|
||||
full_data_movies=full_data_movies,
|
||||
total_movies=total_movies,
|
||||
total_distributors=total_distributors,
|
||||
total_studios=total_studios,
|
||||
task_status=task_status)
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"更新任务 {task_id} 失败: {e}")
|
||||
|
||||
|
||||
# 任务结束,更新字段
|
||||
def finalize_task_log(task_id):
|
||||
try:
|
||||
# 更新 task_log
|
||||
update_task_log(task_id, task_status="Success")
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"任务 {task_id} 结束失败: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
try:
|
||||
with open('../result/detail.json', 'r') as file:
|
||||
performers = json.load(file)
|
||||
for performer in performers:
|
||||
insert_or_update_performer(performer)
|
||||
|
||||
print(query_performer("Kirsten"))
|
||||
#delete_performer("https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34")
|
||||
print(query_performer_hrefs())
|
||||
except FileNotFoundError:
|
||||
logging.info("detail.json not found, starting fresh.")
|
||||
101
iafd/src/utils.py
Normal file
@ -0,0 +1,101 @@
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
import logging
|
||||
import config
|
||||
|
||||
# 解析 height 和 weight(转换成数字)
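# Note: both parsers below short-circuit with an early "return 0", so the conversion code
# after it is unreachable and height/weight are always stored as 0. Removing the early
# return would re-enable parsing: height takes the number inside a trailing "(... cm)" and
# weight takes the leading integer token of the string.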
|
||||
def parse_height(height_str):
|
||||
return 0
|
||||
try:
|
||||
return int(height_str.split("(")[-1].replace(" cm)", ""))
|
||||
except:
|
||||
return None
|
||||
|
||||
def parse_weight(weight_str):
|
||||
return 0
|
||||
try:
|
||||
return int(weight_str.split(" ")[0])
|
||||
except:
|
||||
return None
|
||||
|
||||
update_dir = f'{config.global_host_data_dir}/iafd'
|
||||
performers_dir = f'{update_dir}/performers'
|
||||
movies_dir = f'{update_dir}/movies'
|
||||
|
||||
def to_number(value):
|
||||
"""将字符串转换为数字,如果无效则返回 0"""
|
||||
try:
|
||||
return float(value)
|
||||
except (ValueError, TypeError):
|
||||
return 0
|
||||
|
||||
def dist_stu_href_rewrite(href):
|
||||
# 提取 ID(适用于 distrib 或 studio)
|
||||
import re
|
||||
match = re.search(r"(distrib|studio)=(\d+)", href)
|
||||
if not match:
|
||||
return None # 不是目标 URL,返回 None
|
||||
|
||||
key, id_number = match.groups()
|
||||
new_url = f"https://www.iafd.com/{key}.rme/{key}={id_number}"
|
||||
return new_url
|
||||
|
||||
# 创建目录
|
||||
def create_sub_directory(base_dir, name):
|
||||
# Use the first character of the name (lower-cased) as the sub-directory
|
||||
sub_dir = name[:1].lower()
|
||||
full_path = os.path.join(base_dir, sub_dir)
|
||||
if not os.path.exists(full_path):
|
||||
os.makedirs(full_path)
|
||||
return full_path
|
||||
|
||||
# 从 https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586 中抽取 id 的值
|
||||
def extract_id_from_href(href):
|
||||
"""从href中提取id参数"""
|
||||
match = re.search(r'id=([a-f0-9\-]+)', href)
|
||||
return match.group(1) if match else ''
|
||||
|
||||
# 写入每个 performer 的单独 JSON 文件
|
||||
def write_person_json(person, href, data):
|
||||
# 获取目录
|
||||
person_dir = create_sub_directory(performers_dir, person)
|
||||
person_id = extract_id_from_href(href)
|
||||
person_filename = f"{person.replace(' ', '-')}({person_id}).json" # 用 - 替换空格
|
||||
full_path = os.path.join(person_dir, person_filename)
|
||||
|
||||
try:
|
||||
with open(full_path, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(data, json_file, indent=4, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing file {full_path}: {e}")
|
||||
|
||||
|
||||
# Write each movie to its own JSON file
|
||||
def write_movie_json(href, data):
|
||||
# 获取目录
|
||||
movie_id = extract_id_from_href(href)
|
||||
person_dir = create_sub_directory(movies_dir, movie_id)
|
||||
person_filename = f"{movie_id}.json" # 用 - 替换空格
|
||||
full_path = os.path.join(person_dir, person_filename)
|
||||
|
||||
try:
|
||||
with open(full_path, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(data, json_file, indent=4, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing file {full_path}: {e}")
|
||||
|
||||
|
||||
# 读取json文件并返回内容
|
||||
def read_json(file_path):
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"文件 {file_path} 未找到.")
|
||||
return None
|
||||
except json.JSONDecodeError:
|
||||
print(f"文件 {file_path} 解析错误.")
|
||||
return None
|
||||
26
iafd/src_json/config.py
Normal file
@ -0,0 +1,26 @@
|
||||
import logging
|
||||
import os
|
||||
import inspect
|
||||
from datetime import datetime
|
||||
|
||||
global_share_data_dir = '/root/sharedata'
|
||||
global_host_data_dir = '/root/hostdir/scripts_data'
|
||||
|
||||
# 设置日志配置
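# Call setup_logging() once at script start-up. If no log_filename is given, the log file
# defaults to ../log/<calling script name>_<yyyymmdd>.log, and log output is also echoed
# to the console.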
|
||||
def setup_logging(log_filename=None):
|
||||
# 如果未传入 log_filename,则使用当前脚本名称作为日志文件名
|
||||
if log_filename is None:
|
||||
# 获取调用 setup_logging 的脚本文件名
|
||||
caller_frame = inspect.stack()[1]
|
||||
caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
|
||||
|
||||
# 获取当前日期,格式为 yyyymmdd
|
||||
current_date = datetime.now().strftime('%Y%m%d')
|
||||
# 拼接 log 文件名,将日期加在扩展名前
|
||||
log_filename = f'../log/{caller_filename}_{current_date}.log'
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler(log_filename),
|
||||
logging.StreamHandler()
|
||||
])
|
||||
334
iafd/src_json/movie_detail_fetch.py
Normal file
@ -0,0 +1,334 @@
|
||||
import os
|
||||
import json
|
||||
import csv
|
||||
import time
|
||||
import logging
|
||||
import sys
|
||||
import signal
|
||||
import re
|
||||
import cloudscraper
|
||||
from bs4 import BeautifulSoup
|
||||
import config
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
# 定义基础 URL 和可变参数
|
||||
host_url = "https://www.iafd.com"
|
||||
|
||||
# 目录和文件路径
|
||||
RESULT_DIR = "../result"
|
||||
OUTPUT_DIR = f"{config.global_share_data_dir}/iafd"
|
||||
INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
|
||||
OUTPUT_JSON = os.path.join(OUTPUT_DIR, "movie_details.json")
|
||||
OUTPUT_CSV = os.path.join(OUTPUT_DIR, "movie_details.csv")
|
||||
BATCH_SIZE = 100 # 每100条数据写入文件
|
||||
movies_dir = f'{RESULT_DIR}/movies'
|
||||
|
||||
# 初始化 Cloudflare 绕过工具
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
# 全量数据
|
||||
all_movies = []
|
||||
|
||||
def load_existing_data():
|
||||
"""加载已处理的数据,支持续传"""
|
||||
if os.path.exists(OUTPUT_JSON):
|
||||
with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
|
||||
try:
|
||||
return json.load(f)
|
||||
except json.JSONDecodeError:
|
||||
return []
|
||||
return []
|
||||
|
||||
|
||||
def save_data():
|
||||
"""保存数据到 JSON 和 CSV 文件"""
|
||||
logging.info("Saving data...")
|
||||
global all_movies
|
||||
|
||||
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
|
||||
json.dump(all_movies, f, indent=4, ensure_ascii=False)
|
||||
|
||||
with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
|
||||
writer = csv.writer(f)
|
||||
writer.writerow(["href", "title", "Minutes", "Distributor", "Studio", "ReleaseDate",
|
||||
"AddedtoIAFDDate", "All-Girl", "All-Male", "Compilation", "Webscene", "Director"])
|
||||
for movie in all_movies:
|
||||
writer.writerow([movie["href"], movie["title"], movie["Minutes"], movie["Distributor"],
|
||||
movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"], movie["All-Girl"],
|
||||
movie["All-Male"], movie["Compilation"], movie["Webscene"], movie["Director"]])
|
||||
|
||||
# 请求网页并返回 HTML 内容
|
||||
def fetch_html(href):
|
||||
"""请求网页并返回 HTML 内容"""
|
||||
for attempt in range(3):
|
||||
try:
|
||||
response = scraper.get(href, timeout=10)
|
||||
if response.status_code == 200:
|
||||
return response.text
|
||||
except Exception as e:
|
||||
logging.warning(f"Error fetching {href}: {e}")
|
||||
time.sleep(2)
|
||||
|
||||
logging.error(f"Failed to fetch {href} after 3 attempts")
|
||||
return None
|
||||
|
||||
# 解析网页 HTML 并提取电影信息
|
||||
def parse_movie_details(html, href, title):
|
||||
"""解析网页 HTML 并提取电影信息"""
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# 解析电影基础信息
|
||||
movie_data = {}
|
||||
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
|
||||
if info_div:
|
||||
labels = info_div.find_all("p", class_="bioheading")
|
||||
values = info_div.find_all("p", class_="biodata")
|
||||
for label, value in zip(labels, values):
|
||||
key = label.text.strip()
|
||||
val = value.text.strip()
|
||||
if key in ["Distributor", "Studio", "Director"]:
|
||||
link = value.find("a")
|
||||
if link:
|
||||
val = link.text.strip()
|
||||
movie_data[f'{key}Href'] = host_url + link['href']
|
||||
movie_data[key] = val
|
||||
else:
|
||||
return None
|
||||
|
||||
# 解析演职人员信息
|
||||
performers = []
|
||||
cast_divs = soup.find_all("div", class_="castbox")
|
||||
for cast in cast_divs:
|
||||
performer = {}
|
||||
link = cast.find("a")
|
||||
if link:
|
||||
performer["name"] = link.text.strip()
|
||||
performer["href"] = host_url + link["href"]
|
||||
|
||||
performer["tags"] = [
|
||||
tag.strip() for br in cast.find_all("br")
|
||||
if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
|
||||
]
|
||||
|
||||
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
|
||||
performers.append(performer)
|
||||
|
||||
# 解析场景拆解
|
||||
scene_breakdowns = []
|
||||
scene_table = soup.find("div", id="sceneinfo")
|
||||
if scene_table:
|
||||
rows = scene_table.find_all("tr")
|
||||
|
||||
for row in rows:
|
||||
cols = row.find_all("td")
|
||||
if len(cols) >= 2:
|
||||
scene = cols[0].text.strip() # 场景编号
|
||||
performer_info = cols[1] # 包含表演者及链接信息
|
||||
|
||||
# 获取 <br> 之前的完整 HTML(保留 <i> 标签等格式)
|
||||
performer_html = str(performer_info) # 获取所有HTML内容
|
||||
split_html = performer_html.split("<br/>") # 按 <br> 进行分割
|
||||
if split_html:
|
||||
performers_html = split_html[0].strip() # 取 <br> 之前的部分
|
||||
else:
|
||||
split_html = performer_html.split("<br>") # 按 <br> 进行分割
|
||||
if split_html:
|
||||
performers_html = split_html[0].strip() # 取 <br> 之前的部分
|
||||
else:
|
||||
performers_html = performer_html.strip() # 如果没有 <br>,取全部
|
||||
|
||||
# 解析为纯文本(去除HTML标签,仅提取文本内容)
|
||||
performers_soup = BeautifulSoup(performers_html, "html.parser")
|
||||
performers_text = performers_soup.get_text()
|
||||
|
||||
# 提取表演者
|
||||
scene_performers = [p.strip() for p in performers_text.split(",")]
|
||||
|
||||
# 尝试获取 `webscene` 和 `studio`
|
||||
links_data = {}
|
||||
links = performer_info.find_all("a")
|
||||
if links:
|
||||
webscene_title = links[0].text.strip() if len(links)>0 else None
|
||||
webscene = links[0]["href"] if len(links)>0 else None
|
||||
studio = links[1].text.strip() if len(links)>1 else None
|
||||
studio_lnk = links[1]["href"] if len(links)>1 else None
|
||||
links_data = {
|
||||
"title": webscene_title,
|
||||
"webscene": webscene,
|
||||
"studio": studio,
|
||||
"studio_lnk": studio_lnk,
|
||||
}
|
||||
|
||||
scene_data = {
|
||||
"scene": scene,
|
||||
"performers": scene_performers,
|
||||
**links_data,
|
||||
}
|
||||
scene_breakdowns.append(scene_data)
|
||||
|
||||
appears_in = []
|
||||
appears_divs = soup.find("div", id="appearssection")
|
||||
if appears_divs:
|
||||
rows = appears_divs.find_all("li")
|
||||
for row in rows:
|
||||
lnk = row.find("a")
|
||||
if lnk:
|
||||
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
|
||||
|
||||
|
||||
return {
|
||||
"href": href,
|
||||
"title": title,
|
||||
"Minutes": movie_data.get("Minutes", ""),
|
||||
"Distributor": movie_data.get("Distributor", ""),
|
||||
"Studio": movie_data.get("Studio", ""),
|
||||
"ReleaseDate": movie_data.get("Release Date", ""),
|
||||
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
|
||||
"All-Girl": movie_data.get("All-Girl", ""),
|
||||
"All-Male": movie_data.get("All-Male", ""),
|
||||
"Compilation": movie_data.get("Compilation", ""),
|
||||
"Webscene": movie_data.get("Webscene", ""),
|
||||
"Director": movie_data.get("Director", ""),
|
||||
"DirectorHref": movie_data.get("DirectorHref", ""),
|
||||
"DistributorHref": movie_data.get("DistributorHref", ""),
|
||||
"StudioHref": movie_data.get("StudioHref", ""),
|
||||
"Performers": performers,
|
||||
"SceneBreakdowns": scene_breakdowns,
|
||||
"AppearsIn": appears_in,
|
||||
}
|
||||
|
||||
# 创建目录
|
||||
def create_sub_directory(base_dir, name):
|
||||
# Use the first character of the name (lower-cased) as the sub-directory
|
||||
sub_dir = name[:1].lower()
|
||||
full_path = os.path.join(base_dir, sub_dir)
|
||||
if not os.path.exists(full_path):
|
||||
os.makedirs(full_path)
|
||||
return full_path
|
||||
|
||||
# 从 https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586 中抽取 id 的值
|
||||
def extract_id_from_href(href):
|
||||
"""从href中提取id参数"""
|
||||
match = re.search(r'id=([a-f0-9\-]+)', href)
|
||||
return match.group(1) if match else ''
|
||||
|
||||
# Write each movie to its own JSON file
|
||||
def write_movie_json(href, data):
|
||||
# 获取目录
|
||||
movie_id = extract_id_from_href(href)
|
||||
person_dir = create_sub_directory(movies_dir, movie_id)
|
||||
person_filename = f"{movie_id}.json" # 用 - 替换空格
|
||||
full_path = os.path.join(person_dir, person_filename)
|
||||
|
||||
try:
|
||||
with open(full_path, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(data, json_file, indent=4, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing file {full_path}: {e}")
|
||||
|
||||
def process_movies():
|
||||
"""处理电影数据"""
|
||||
global all_movies
|
||||
all_movies = load_existing_data()
|
||||
processed_hrefs = {movie["href"] for movie in all_movies}
|
||||
|
||||
# 读取 distributors.json 文件
|
||||
with open(INPUT_FILE, "r", encoding="utf-8") as f:
|
||||
movies = json.load(f)
|
||||
|
||||
count = 0
|
||||
|
||||
for entry in movies:
|
||||
href = entry["href"]
|
||||
title = entry["title"]
|
||||
|
||||
if href in processed_hrefs:
|
||||
logging.info(f"Skiping existed: {title} ({href})")
|
||||
continue # 跳过已处理数据
|
||||
|
||||
logging.info(f"Processing: {title} ({href})")
|
||||
|
||||
while True:
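# Keep retrying this movie until both the fetch and the parse succeed; fetch_html itself
# already retries up to 3 times per call, so a persistently bad page loops here indefinitely.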
|
||||
html = fetch_html(href)
|
||||
if not html:
|
||||
logging.warning(f'Retrying {title} ({href})')
|
||||
continue  # fetch failed, retry
|
||||
else:
|
||||
movie = parse_movie_details(html, href, title)
|
||||
if not movie:
|
||||
logging.warning(f'Retrying {title} ({href})')
|
||||
continue
|
||||
else:
|
||||
all_movies.append(movie)
|
||||
count += 1
|
||||
|
||||
# 写入本地文件
|
||||
write_movie_json(href, movie)
|
||||
break
|
||||
|
||||
# 每 BATCH_SIZE 条数据刷新一次文件
|
||||
if count % BATCH_SIZE == 0:
|
||||
save_data()
|
||||
|
||||
# 最终保存文件
|
||||
save_data()
|
||||
|
||||
logging.info("Task completed.")
|
||||
|
||||
|
||||
# 从 https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586 中抽取 id 的值
|
||||
def extract_id_from_href(href):
|
||||
"""从href中提取id参数"""
|
||||
match = re.search(r'id=([a-f0-9\-]+)', href)
|
||||
return match.group(1) if match else ''
|
||||
|
||||
# 指定url访问
|
||||
def process_one(href):
|
||||
# 初始化 cloudscraper
|
||||
scraper = cloudscraper.create_scraper()
|
||||
# 获取并解析数据
|
||||
movie = {}
|
||||
while True:
|
||||
html = fetch_html(href)
|
||||
if not html:
|
||||
logging.warning(f'fetching {href} error. retrying...')
|
||||
continue # 获取失败,跳过
|
||||
|
||||
movie = parse_movie_details(html, href, 'title')
|
||||
if movie:
|
||||
break
|
||||
else:
|
||||
logging.warning(f'fetching {href} error. retrying...')
|
||||
continue # 获取失败,跳过
|
||||
|
||||
if movie:
|
||||
write_movie_json(href, movie)
|
||||
|
||||
print(f'fetch succeeded. saved result in {movies_dir}')
|
||||
|
||||
# 处理程序被终止时的数据
|
||||
def handle_exit_signal(signal, frame):
|
||||
logging.info("Gracefully exiting... Saving remaining data to Json and CSV.")
|
||||
save_data()
|
||||
sys.exit(0)
|
||||
|
||||
# 全量访问
|
||||
def main():
|
||||
try:
|
||||
# 注册退出信号
|
||||
signal.signal(signal.SIGINT, handle_exit_signal) # Handle Ctrl+C
|
||||
signal.signal(signal.SIGTERM, handle_exit_signal) # Handle kill signal
|
||||
process_movies()
|
||||
finally:
|
||||
# 清理操作,保证在程序正常退出时执行
|
||||
save_data()
|
||||
logging.info("Data processing completed.")
|
||||
|
||||
# 程序入口,读取参数
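# Usage:
#   python movie_detail_fetch.py                -> crawl every entry in ../result/movie_list.json
#   python movie_detail_fetch.py <movie_href>   -> fetch a single movie page and save its JSON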
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) > 1:
|
||||
url = sys.argv[1]
|
||||
process_one(url)
|
||||
else:
|
||||
main()
|
||||
255
iafd/src_json/movie_list_fetch.py
Normal file
@ -0,0 +1,255 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import cloudscraper
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
import argparse
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import config
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
# 定义基础 URL 和可变参数
|
||||
host_url = "https://www.iafd.com"
|
||||
# 结果路径
|
||||
res_dir = f"{config.global_share_data_dir}/iafd"
|
||||
|
||||
fetch_config = {
|
||||
'dist': {
|
||||
'base_url': f"{host_url}/distrib.rme/distrib=",
|
||||
'list_page_url': f"{host_url}/distrib.asp",
|
||||
'html_table_id': 'distable',
|
||||
'html_select_name': 'Distrib',
|
||||
'output_key_id': 'distributors',
|
||||
'json_file': f'{res_dir}/distributors.json',
|
||||
'csv_file': f'{res_dir}/distributors.csv',
|
||||
},
|
||||
'stu': {
|
||||
'base_url': f"{host_url}/studio.rme/studio=",
|
||||
'list_page_url': f"{host_url}/studio.asp",
|
||||
'html_table_id': 'studio',
|
||||
'html_select_name': 'Studio',
|
||||
'output_key_id': 'studios',
|
||||
'json_file': f'{res_dir}/studios.json',
|
||||
'csv_file': f'{res_dir}/studios.csv',
|
||||
}
|
||||
}
|
||||
|
||||
distr_map = {
|
||||
6812 : 'nubilefilms.com',
|
||||
8563 : 'teenmegaworld network',
|
||||
6779 : 'x-art.com',
|
||||
7133 : 'tushy.com',
|
||||
6496 : 'blacked.com',
|
||||
7758 : 'vixen.com',
|
||||
6791 : 'teamskeet.com',
|
||||
12454: 'vip4k.com',
|
||||
13541: 'wow network',
|
||||
9702 : 'cum4k.com',
|
||||
6778 : 'tiny4k.com',
|
||||
12667: 'anal4k.com',
|
||||
7419 : 'exotic4k.com',
|
||||
13594: 'facials4k.com',
|
||||
13633: 'mom4k.com',
|
||||
12335: 'slim4k.com',
|
||||
16709: 'strippers4k.com',
|
||||
|
||||
}
|
||||
studio_map = {
|
||||
6812 : 'nubilefilms.com',
|
||||
9811 : 'Teen Mega World',
|
||||
6779 : 'x-art.com',
|
||||
7133 : 'tushy.com',
|
||||
6496 : 'blacked.com',
|
||||
7758 : 'vixen.com',
|
||||
6791 : 'teamskeet.com',
|
||||
8052: 'wowgirls.com',
|
||||
9702 : 'cum4k.com',
|
||||
6778 : 'tiny4k.com',
|
||||
12667: 'anal4k.com',
|
||||
7419 : 'exotic4k.com',
|
||||
13594: 'facials4k.com',
|
||||
13633: 'mom4k.com',
|
||||
12335: 'slim4k.com',
|
||||
16709: 'strippers4k.com',
|
||||
|
||||
}
|
||||
|
||||
# 设置 headers 和 scraper
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
all_data = []
|
||||
|
||||
# 网络请求并解析 HTML
|
||||
def fetch_page(url):
|
||||
try:
|
||||
response = scraper.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to fetch {url}: {e}")
|
||||
return None
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_page(html, name, config):
|
||||
table_id = config['html_table_id']
|
||||
key_id = config['output_key_id']
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
table = soup.find("table", id=table_id)
|
||||
|
||||
if not table:
|
||||
logging.warning(f"Warning: No {table_id} table found in {name}")
|
||||
return None
|
||||
|
||||
# 找到thead并跳过
|
||||
thead = table.find('thead')
|
||||
if thead:
|
||||
thead.decompose() # 去掉thead部分,不需要解析
|
||||
|
||||
# 现在只剩下tbody部分
|
||||
tbody = table.find('tbody')
|
||||
rows = tbody.find_all('tr') if tbody else []
|
||||
|
||||
global all_data
|
||||
for row in rows:
|
||||
cols = row.find_all('td')
|
||||
if len(cols) >= 5:
|
||||
title = cols[0].text.strip()
|
||||
label = cols[1].text.strip()
|
||||
year = cols[2].text.strip()
|
||||
rev = cols[3].text.strip()
|
||||
a_href = cols[0].find('a')
|
||||
href = host_url + a_href['href'] if a_href else ''
|
||||
|
||||
all_data.append({
|
||||
key_id: name,
|
||||
'title': title,
|
||||
'label': label,
|
||||
'year': year,
|
||||
'rev': rev,
|
||||
'href': href
|
||||
})
|
||||
return soup
|
||||
|
||||
# Handle pagination; these list pages have no next page, so always return None
|
||||
def handle_pagination(soup, astro):
|
||||
return None
|
||||
|
||||
# 获取列表页
|
||||
def process_list_page(config):
|
||||
list_page_url=config['list_page_url']
|
||||
select_name = config['html_select_name']
|
||||
list_map = {}
|
||||
|
||||
logging.info(f"Fetching data for {list_page_url} ...")
|
||||
select_element = None
|
||||
while True:
|
||||
html = fetch_page(list_page_url)
|
||||
if html:
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
select_element = soup.find('select', {'name': select_name})
|
||||
if select_element :
|
||||
break
|
||||
else:
|
||||
logging.info(f"wrong html content. retring {list_page_url} ...")
|
||||
else:
|
||||
logging.info(f"wrong html content. retring {list_page_url} ...")
|
||||
|
||||
if not select_element:
|
||||
return None
|
||||
|
||||
options = select_element.find_all('option')
|
||||
for option in options:
|
||||
value = option.get('value') # 获取 value 属性
|
||||
text = option.text.strip() # 获取文本内容
|
||||
list_map[int(value)] = text
|
||||
logging.info(f'fetch {list_page_url} succ. total lines: {len(list_map)}')
|
||||
return list_map
|
||||
|
||||
# 主逻辑函数:循环处理每个种族
|
||||
def process_main_data(list_data, config):
|
||||
base_url = config['base_url']
|
||||
|
||||
for key, name in list_data.items():
|
||||
url = base_url + str(key)
|
||||
next_url = url
|
||||
logging.info(f"Fetching data for {name}, url {url} ...")
|
||||
|
||||
while next_url:
|
||||
html = fetch_page(next_url)
|
||||
if html:
|
||||
soup = parse_page(html, name, config)
|
||||
if soup:
|
||||
next_url = handle_pagination(soup, name)
|
||||
else:
|
||||
logging.info(f"wrong html content. retring {next_url} ...")
|
||||
# 定期保存结果
|
||||
save_data(config)
|
||||
time.sleep(2) # 控制访问频率
|
||||
else:
|
||||
logging.info(f"Retrying {next_url} ...")
|
||||
time.sleep(5) # 等待后再重试
|
||||
|
||||
# 保存到文件
|
||||
def save_data(config):
|
||||
with open(config['json_file'], 'w', encoding='utf-8') as json_file:
|
||||
json.dump(all_data, json_file, indent=4, ensure_ascii=False)
|
||||
|
||||
with open(config['csv_file'], 'w', newline='', encoding='utf-8') as csv_file:
|
||||
writer = csv.DictWriter(csv_file, fieldnames=[config['output_key_id'], 'title', 'label', 'year', 'rev', 'href'])
|
||||
writer.writeheader()
|
||||
writer.writerows(all_data)
|
||||
|
||||
|
||||
# 执行主逻辑
|
||||
if __name__ == '__main__':
|
||||
# 命令行参数处理
|
||||
parser = argparse.ArgumentParser(description='fetch movie list from iafd.com')
|
||||
parser.add_argument('--type', type=str, default='dist', help='fetch by ... (dist , stu)')
|
||||
parser.add_argument('--kind', type=str, default='parts', help='fetch all or parts (parts , all)')
|
||||
args = parser.parse_args()
|
||||
|
||||
config = fetch_config.get(args.type)
|
||||
if not config:
|
||||
logging.warning(f'unknown type: {args.type} {args.kind}')
|
||||
else:
|
||||
list_data = {}
|
||||
if args.kind == 'all':
|
||||
list_data = process_list_page(config)
|
||||
elif args.type == 'dist':
|
||||
list_data = distr_map
|
||||
else:
|
||||
list_data = studio_map
|
||||
|
||||
process_main_data(list_data, config)
|
||||
logging.info("Data fetching and saving completed.")
|
||||
393
iafd/src_json/performers_details.py
Normal file
@ -0,0 +1,393 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import cloudscraper
|
||||
import time
|
||||
import json
|
||||
import csv
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from requests.exceptions import RequestException
|
||||
import config
|
||||
|
||||
# 配置日志
|
||||
config.setup_logging()
|
||||
|
||||
# 结果路径
|
||||
res_dir = '../result'
|
||||
res_json_file = f'{res_dir}/detail.json'
|
||||
res_csv_file = f'{res_dir}/detail.csv'
|
||||
input_json_file = f'{res_dir}/merged.json'
|
||||
performers_dir = f'{res_dir}/performers'
|
||||
|
||||
# 存储结果
|
||||
final_data = []
|
||||
|
||||
# 读取 detail.json 中的 数据,以便于断点续传
|
||||
def load_existing_hrefs():
|
||||
existing_hrefs = set()
|
||||
global final_data
|
||||
try:
|
||||
with open(res_json_file, 'r') as file:
|
||||
final_data = json.load(file)
|
||||
for entry in final_data:
|
||||
existing_hrefs.add(entry['href'])
|
||||
except FileNotFoundError:
|
||||
logging.info("detail.json not found, starting fresh.")
|
||||
return existing_hrefs
|
||||
|
||||
# 解析 作品列表,有个人出演,也有导演的
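# Besides collecting the movie rows, this also counts how many titles match each distributor
# keyword in distributor_list (e.g. vixen/blacked/tushy/x-art) so the caller can fill the *_cnt fields.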
|
||||
def parse_credits_table(table, distributor_list):
|
||||
# 找到thead并跳过
|
||||
thead = table.find('thead')
|
||||
if thead:
|
||||
thead.decompose() # 去掉thead部分,不需要解析
|
||||
|
||||
# 现在只剩下tbody部分
|
||||
tbody = table.find('tbody')
|
||||
rows = tbody.find_all('tr') if tbody else []
|
||||
|
||||
movies = []
|
||||
distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数
|
||||
|
||||
# rows = table.find_all('tr', class_='we')
|
||||
for row in rows:
|
||||
cols = row.find_all('td')
|
||||
if len(cols) >= 6:
|
||||
title = cols[0].text.strip()
|
||||
year = cols[1].text.strip()
|
||||
distributor = cols[2].text.strip().lower()
|
||||
notes = cols[3].text.strip()
|
||||
rev = cols[4].text.strip()
|
||||
formats = cols[5].text.strip()
|
||||
|
||||
for key in distributor_list:
|
||||
if key in distributor:
|
||||
distributor_count[key] += 1
|
||||
|
||||
movies.append({
|
||||
'title': title,
|
||||
'year': year,
|
||||
'distributor': distributor,
|
||||
'notes': notes,
|
||||
'rev': rev,
|
||||
'formats': formats
|
||||
})
|
||||
return movies, distributor_count
|
||||
|
||||
|
||||
# 请求网页并提取所需数据
|
||||
def fetch_and_parse_page(url, scraper):
|
||||
try:
|
||||
response = scraper.get(url)
|
||||
if response.status_code != 200:
|
||||
logging.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
|
||||
return None, None
|
||||
|
||||
# 解析 HTML 内容
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
# 提取数据
|
||||
data = {}
|
||||
|
||||
# 定义我们需要的字段名称和HTML中对应的标签
|
||||
fields = {
|
||||
'performer_aka': 'Performer AKA',
|
||||
'birthday': 'Birthday',
|
||||
'astrology': 'Astrology',
|
||||
'birthplace': 'Birthplace',
|
||||
'gender': 'Gender',
|
||||
'years_active': 'Years Active',
|
||||
'ethnicity': 'Ethnicity',
|
||||
'nationality': 'Nationality',
|
||||
'hair_colors': 'Hair Colors',
|
||||
'eye_color': 'Eye Color',
|
||||
'height': 'Height',
|
||||
'weight': 'Weight',
|
||||
'measurements': 'Measurements',
|
||||
'tattoos': 'Tattoos',
|
||||
'piercings': 'Piercings'
|
||||
}
|
||||
reversed_map = {v: k for k, v in fields.items()}
|
||||
|
||||
# 解析表格数据, 获取参演或者导演的列表
|
||||
role_list = ['personal', 'directoral']
|
||||
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
|
||||
credits_list = {}
|
||||
|
||||
# 使用字典来存储统计
|
||||
distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数
|
||||
for role in role_list:
|
||||
table = soup.find('table', id=role)
|
||||
if table :
|
||||
movies, stat_map = parse_credits_table(table, distributor_list)
|
||||
credits_list[role] = movies
|
||||
# 更新 distributor 统计
|
||||
for distributor in distributor_list:
|
||||
distributor_count[distributor] += stat_map.get(distributor, 0)
|
||||
|
||||
# 统计 movies 数量
|
||||
#movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
|
||||
movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
|
||||
|
||||
# 如果没有找到
|
||||
if len(credits_list) == 0 :
|
||||
logging.warning(f"movie table empty. url: {url} ")
|
||||
|
||||
# 遍历每个 bioheading, 获取metadata
|
||||
bioheadings = soup.find_all('p', class_='bioheading')
|
||||
for bio in bioheadings:
|
||||
heading = bio.text.strip()
|
||||
biodata = None
|
||||
|
||||
# 如果包含 "Performer",需要特殊处理
|
||||
if 'Performer' in heading:
|
||||
heading = 'Performer AKA'
|
||||
biodata_div = bio.find_next('div', class_='biodata')
|
||||
if biodata_div:
|
||||
div_text = biodata_div.get_text(separator='|').strip()
|
||||
biodata = [b.strip() for b in div_text.split('|') if b.strip()]
|
||||
else:
|
||||
biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
|
||||
|
||||
# 保存数据
|
||||
if heading in reversed_map:
|
||||
kkey = reversed_map[heading]
|
||||
data[kkey] = biodata
|
||||
|
||||
# 添加统计数据到 data
|
||||
data['movies_cnt'] = movies_cnt
|
||||
data['vixen_cnt'] = distributor_count['vixen']
|
||||
data['blacked_cnt'] = distributor_count['blacked']
|
||||
data['tushy_cnt'] = distributor_count['tushy']
|
||||
data['x_art_cnt'] = distributor_count['x-art']
|
||||
|
||||
return data, credits_list
|
||||
except RequestException as e:
|
||||
logging.error(f"Error fetching {url}: {e}")
|
||||
return None, None
|
||||
|
||||
# 写入 detail.json
|
||||
def write_to_detail_json(data):
|
||||
with open(res_json_file, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(data, json_file, indent=4, ensure_ascii=False)
|
||||
|
||||
# 写入 CSV 文件
|
||||
def write_to_csv(data):
|
||||
try:
|
||||
with open(res_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
|
||||
writer = csv.writer(csvfile, delimiter=',')
|
||||
header = ['person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender', 'years_active', 'ethnicity',
|
||||
'nationality', 'hair_colors', 'eye_color', 'height', 'weight', 'measurements', 'tattoos', 'piercings',
|
||||
'movies_cnt', 'vixen_cnt', 'blacked_cnt', 'tushy_cnt', 'x_art_cnt']
|
||||
writer.writerow(header)
|
||||
for entry in data:
|
||||
# 确保 performer_aka 始终为列表类型
|
||||
performer_aka = entry.get('performer_aka', [])
|
||||
|
||||
# 如果是 None 或非列表类型,转换为一个空列表
|
||||
if performer_aka is None:
|
||||
performer_aka = []
|
||||
elif not isinstance(performer_aka, list):
|
||||
performer_aka = [performer_aka]
|
||||
|
||||
writer.writerow([
|
||||
entry.get('person', ''),
|
||||
entry.get('href', ''),
|
||||
'|'.join(performer_aka),
|
||||
entry.get('birthday', ''),
|
||||
entry.get('astrology', ''),
|
||||
entry.get('birthplace', ''),
|
||||
entry.get('gender', ''),
|
||||
entry.get('years_active', ''),
|
||||
entry.get('ethnicity', ''),
|
||||
entry.get('nationality', ''),
|
||||
entry.get('hair_colors', ''),
|
||||
entry.get('eye_color', ''),
|
||||
entry.get('height', ''),
|
||||
entry.get('weight', ''),
|
||||
entry.get('measurements', ''),
|
||||
entry.get('tattoos', ''),
|
||||
entry.get('piercings', ''),
|
||||
entry.get('movies_cnt', 0),
|
||||
entry.get('vixen_cnt', 0),
|
||||
entry.get('blacked_cnt', 0),
|
||||
entry.get('tushy_cnt', 0),
|
||||
entry.get('x_art_cnt', 0)
|
||||
])
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing to CSV: {e}")
|
||||
|
||||
def handle_exit_signal(signal, frame):
|
||||
logging.info("Gracefully exiting... Saving remaining data to Json and CSV.")
|
||||
write_to_csv(final_data) # Ensure final data is written when exiting
|
||||
write_to_detail_json(final_data)
|
||||
sys.exit(0)
|
||||
|
||||
# 创建目录
|
||||
def create_directory_for_person(person):
|
||||
# Use the first letter of the person's name, lowercased, as the bucket directory
|
||||
person_dir = person[:1].lower()
|
||||
full_path = os.path.join(performers_dir, person_dir)
|
||||
if not os.path.exists(full_path):
|
||||
os.makedirs(full_path)
|
||||
return full_path
|
||||
|
||||
# 从 https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586 中抽取 id 的值
|
||||
def extract_id_from_href(href):
|
||||
"""从href中提取id参数"""
|
||||
match = re.search(r'id=([a-f0-9\-]+)', href)
|
||||
return match.group(1) if match else ''
|
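# Example, using the URL from the comment above:
#   extract_id_from_href("https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586")
#   -> "21898a3c-1ddd-4793-8d93-375d6db20586"
# An href without an id= parameter returns ''.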
||||
|
||||
# 写入每个 performer 的单独 JSON 文件
|
||||
def write_person_json(person, href, data):
|
||||
# 获取目录
|
||||
person_dir = create_directory_for_person(person)
|
||||
person_id = extract_id_from_href(href)
|
||||
person_filename = f"{person.replace(' ', '-')}({person_id}).json" # 用 - 替换空格
|
||||
full_path = os.path.join(person_dir, person_filename)
|
||||
|
||||
try:
|
||||
with open(full_path, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(data, json_file, indent=4, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing file {full_path}: {e}")
|
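# Resulting layout (sketch with a hypothetical name, assuming performers_dir points at e.g. 'performers'):
#   performers/j/Jane-Doe(21898a3c-1ddd-4793-8d93-375d6db20586).json
# i.e. files are bucketed by the first letter of the name and keyed by the IAFD id.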
||||
|
||||
|
||||
# 指定url访问
|
||||
def process_one(href):
|
||||
# 初始化 cloudscraper
|
||||
scraper = cloudscraper.create_scraper()
|
||||
# 获取并解析数据
|
||||
while True:
|
||||
data, movies = fetch_and_parse_page(href, scraper)
|
||||
if data is None:
|
||||
logging.warning(f'Retrying {href} ...')
|
||||
time.sleep(3)
|
||||
else:
|
||||
break
|
||||
|
||||
# 写入 performer 的独立 JSON 文件
|
||||
full_data = {
|
||||
**data,
|
||||
'credits': movies if movies else {}
|
||||
}
|
||||
person_id = extract_id_from_href(href)
|
||||
person_filename = f"{person_id}.json" # 用 - 替换空格
|
||||
|
||||
try:
|
||||
with open(person_filename, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(full_data, json_file, indent=4, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing file {person_filename}: {e}")
|
||||
print(f'Fetch succeeded. Saved result to {person_filename}')
|
||||
|
||||
def process_all():
|
||||
# 初始化 cloudscraper
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
# 加载已存在的 href 列表
|
||||
global final_data
|
||||
existing_hrefs = load_existing_hrefs()
|
||||
logging.info(f"load data from {res_json_file}, count: {len(final_data)}")
|
||||
|
||||
# 读取 merged.json
|
||||
with open(input_json_file, 'r') as file:
|
||||
merged_data = json.load(file)
|
||||
|
||||
# 遍历 merged.json 中的数据
|
||||
loop = 0
|
||||
for entry in merged_data:
|
||||
href = entry.get('href')
|
||||
person = entry.get('person')
|
||||
|
||||
if href in existing_hrefs:
|
||||
logging.info(f"Skipping {href} - already processed")
|
||||
continue
|
||||
|
||||
logging.info(f"Processing {href} - {person}")
|
||||
|
||||
# 获取并解析数据
|
||||
while True:
|
||||
data, credits = fetch_and_parse_page(href, scraper)
|
||||
if data is None:
|
||||
logging.warning(f'Retrying {href} - {person} ...')
|
||||
time.sleep(3)
|
||||
else:
|
||||
break
|
||||
|
||||
# 如果数据正确,加入到 final_data
|
||||
final_data.append({
|
||||
'href': href,
|
||||
'person': person,
|
||||
**data
|
||||
})
|
||||
|
||||
# 写入 performer 的独立 JSON 文件
|
||||
full_data = {
|
||||
'href': href,
|
||||
'person': person,
|
||||
**data,
|
||||
'credits': credits if credits else {}
|
||||
}
|
||||
write_person_json(person.strip(), href, full_data)
|
||||
|
||||
# 更新 detail.json 文件
|
||||
loop = loop + 1
|
||||
if loop % 100 == 0:
|
||||
logging.info(f'flush data to json file. now fetched data count: {loop}, total count: {len(final_data)}')
|
||||
write_to_detail_json(final_data)
|
||||
write_to_csv(final_data)
|
||||
|
||||
# 更新已存在的 href
|
||||
existing_hrefs.add(href)
|
||||
|
||||
# 延时,防止请求过快被封锁
|
||||
time.sleep(1)
|
||||
|
||||
# 全量访问
|
||||
def main():
|
||||
try:
|
||||
# 注册退出信号
|
||||
signal.signal(signal.SIGINT, handle_exit_signal) # Handle Ctrl+C
|
||||
signal.signal(signal.SIGTERM, handle_exit_signal) # Handle kill signal
|
||||
process_all()
|
||||
finally:
|
||||
# 清理操作,保证在程序正常退出时执行
|
||||
write_to_csv(final_data) # Write to CSV or other necessary tasks
|
||||
write_to_detail_json(final_data) # Save data to JSON
|
||||
logging.info("Data processing completed.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) > 1:
|
||||
url = sys.argv[1]
|
||||
process_one(url)
|
||||
else:
|
||||
main()
|
||||
140
iafd/src_json/performers_list_astro.py
Normal file
@ -0,0 +1,140 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import cloudscraper
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import config
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
# 定义基础 URL 和可变参数
|
||||
host_url = "https://www.iafd.com"
|
||||
base_url = f"{host_url}/astrology.rme/sign="
|
||||
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
|
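# Example of a URL built from these pieces (see process_astro_data below):
#   base_url + 'Aries' -> https://www.iafd.com/astrology.rme/sign=Aries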
||||
|
||||
# 设置 headers 和 scraper
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
# 结果路径
|
||||
res_dir = '../result'
|
||||
|
||||
# Accumulates the astrology results (astro_map)
|
||||
astro_map = []
|
||||
|
||||
# 网络请求并解析 HTML
|
||||
def fetch_page(url):
|
||||
try:
|
||||
response = scraper.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to fetch {url}: {e}")
|
||||
return None
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_page(html, astro):
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
astro_div = soup.find("div", id="astro")
|
||||
|
||||
if not astro_div:
|
||||
logging.warning(f"Warning: No 'astro' div found in {astro}")
|
||||
return None
|
||||
|
||||
flag = False
|
||||
list_cnt = 0
|
||||
|
||||
birth_date = None
|
||||
for elem in astro_div.find_all(recursive=False):
|
||||
if elem.name == "h3" and "astroday" in elem.get("class", []):
|
||||
birth_date = elem.get_text(strip=True)
|
||||
elif elem.name == "div" and "perficon" in elem.get("class", []):
|
||||
a_tag = elem.find("a")
|
||||
if a_tag:
|
||||
href = host_url + a_tag["href"]
|
||||
name = a_tag.find("span", class_="perfname")
|
||||
if name:
|
||||
astro_map.append({
|
||||
"astrology": astro,
|
||||
"birth_date": birth_date,
|
||||
"person": name.get_text(strip=True),
|
||||
"href": href
|
||||
})
|
||||
flag = True
|
||||
list_cnt = list_cnt +1
|
||||
if flag:
|
||||
logging.info(f"get {list_cnt} persons from this page. total persons: {len(astro_map)}")
|
||||
return soup
|
||||
else:
|
||||
return None
|
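# Sketch of the markup this parser expects (illustration only, inferred from the selectors above):
#   <div id="astro">
#     <h3 class="astroday">January 1</h3>
#     <div class="perficon"><a href="/person.rme/id=..."><span class="perfname">Some Name</span></a></div>
#     ...
#   </div>
# Each perficon entry inherits the most recent astroday heading, so it is recorded as e.g.
#   {"astrology": "Capricorn", "birth_date": "January 1", "person": "Some Name", "href": "https://www.iafd.com/person.rme/id=..."}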
||||
|
||||
# 处理翻页,星座的无需翻页
|
||||
def handle_pagination(soup, astro):
|
||||
return None
|
||||
|
||||
# 主逻辑函数:循环处理每个种族
|
||||
def process_astro_data():
|
||||
for astro in astro_list:
|
||||
url = base_url + astro
|
||||
next_url = url
|
||||
logging.info(f"Fetching data for {astro}, url {url} ...")
|
||||
|
||||
while next_url:
|
||||
html = fetch_page(next_url)
|
||||
if html:
|
||||
soup = parse_page(html, astro)
|
||||
if soup:
|
||||
next_url = handle_pagination(soup, astro)
|
||||
else:
|
||||
logging.info(f"wrong html content. retring {next_url} ...")
|
||||
# 定期保存结果
|
||||
save_data()
|
||||
time.sleep(2) # 控制访问频率
|
||||
else:
|
||||
logging.info(f"Retrying {next_url} ...")
|
||||
time.sleep(5) # 等待后再重试
|
||||
|
||||
# 保存到文件
|
||||
def save_data():
|
||||
with open(f'{res_dir}/astro.json', 'w', encoding='utf-8') as json_file:
|
||||
json.dump(astro_map, json_file, indent=4, ensure_ascii=False)
|
||||
|
||||
with open(f'{res_dir}/astro.csv', 'w', newline='', encoding='utf-8') as csv_file:
|
||||
writer = csv.DictWriter(csv_file, fieldnames=['astrology', 'birth_date', 'person', 'href'])
|
||||
writer.writeheader()
|
||||
writer.writerows(astro_map)
|
||||
|
||||
# 执行主逻辑
|
||||
if __name__ == '__main__':
|
||||
process_astro_data()
|
||||
save_data()
|
||||
logging.info("Data fetching and saving completed.")
|
||||
152
iafd/src_json/performers_list_birth.py
Normal file
@ -0,0 +1,152 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import requests
|
||||
import cloudscraper
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import config
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
# 创建 cloudscraper 会话
|
||||
# 设置 headers 和 scraper
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
# 结果路径
|
||||
res_dir = '../result'
|
||||
|
||||
# 存储出生日期的映射
|
||||
birth_map = []
|
||||
|
||||
# 设置基础URL
|
||||
host_url = "https://www.iafd.com"
|
||||
base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
|
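# Example: base_url.format(month=2, day=14)
#   -> https://www.iafd.com/calendar.asp?calmonth=2&calday=14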
||||
|
||||
# 定义获取页面内容的函数
|
||||
def fetch_page(month, day):
|
||||
url = base_url.format(month=month, day=day)
|
||||
retries = 3
|
||||
while retries > 0:
|
||||
try:
|
||||
# 发送请求并获取页面
|
||||
logging.info(f"Fetching URL: {url}")
|
||||
response = scraper.get(url)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
except requests.exceptions.RequestException as e:
|
||||
logging.error(f"Request failed: {e}")
|
||||
retries -= 1
|
||||
time.sleep(2) # 等待2秒后重试
|
||||
return None
|
||||
|
||||
# 解析页面内容并更新birth_map
|
||||
def parse_page(html, month, day):
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
|
||||
if not datarows:
|
||||
return None
|
||||
|
||||
flag = False
|
||||
list_cnt = 0
|
||||
rows = datarows[0].find_all('div', class_='col-sm-4')
|
||||
for row in rows:
|
||||
link_tag = row.find('a')
|
||||
person = link_tag.text.strip() if link_tag else ''
|
||||
href = link_tag['href'] if link_tag else ''
|
||||
href = host_url + href
|
||||
|
||||
# 如果 href 已经在 birth_map 中,跳过
|
||||
flag = True
|
||||
if any(entry['href'] == href for entry in birth_map):
|
||||
continue
|
||||
|
||||
# 将数据添加到 birth_map
|
||||
birth_map.append({
|
||||
'month': month,
|
||||
'day': day,
|
||||
'person': person,
|
||||
'href': href
|
||||
})
|
||||
list_cnt = list_cnt +1
|
||||
|
||||
if flag:
|
||||
logging.info(f"get {list_cnt} persons from this page. total persons: {len(birth_map)}")
|
||||
return soup
|
||||
else:
|
||||
return None
|
||||
|
||||
# 循环遍历每个日期
|
||||
def fetch_birthdays():
|
||||
for month in range(1, 13): # 遍历1到12月
|
||||
for day in range(1, 32): # 遍历1到31天
|
||||
logging.info(f"Processing: Month {month}, Day {day}")
|
||||
while True:
|
||||
html = fetch_page(month, day)
|
||||
if html:
|
||||
soup = parse_page(html, month, day)
|
||||
if soup:
|
||||
# 定期保存结果
|
||||
save_data()
|
||||
# 跳出while循环,获取下一个生日的url数据
|
||||
time.sleep(2) # 控制访问频率
|
||||
break
|
||||
else:
|
||||
logging.warning(f"No data. Retrying: Month {month}, Day {day}")
|
||||
time.sleep(3) # 等待后再重试
|
||||
else:
|
||||
logging.warning(f"Network error. Retrying: Month {month}, Day {day}")
|
||||
time.sleep(3) # 等待后再重试
|
||||
|
||||
|
||||
|
||||
# 将birth_map保存到json文件
|
||||
def save_data():
|
||||
with open(f'{res_dir}/birth.json', 'w', encoding='utf-8') as f:
|
||||
json.dump(birth_map, f, ensure_ascii=False, indent=4)
|
||||
|
||||
with open(f'{res_dir}/birth.csv', 'w', newline='', encoding='utf-8') as f:
|
||||
writer = csv.DictWriter(f, fieldnames=['month', 'day', 'person', 'href'])
|
||||
writer.writeheader()
|
||||
for entry in birth_map:
|
||||
writer.writerow(entry)
|
||||
|
||||
# 主函数
|
||||
def main():
|
||||
# 获取数据
|
||||
fetch_birthdays()
|
||||
|
||||
# 保存结果
|
||||
save_data()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
166
iafd/src_json/performers_list_ethnic.py
Normal file
@ -0,0 +1,166 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import cloudscraper
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import config
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
# 定义基础 URL 和可变参数
|
||||
host_url = "https://www.iafd.com"
|
||||
base_url = f"{host_url}/lookupethnic.rme/ethnic="
|
||||
ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american', 'middle eastern', 'mediteranean', 'indian', 'polynesian', 'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african', 'south asian']
|
||||
|
||||
# 设置 headers 和 scraper
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
# 结果路径
|
||||
res_dir = '../result'
|
||||
|
||||
# Accumulates the ethnicity results (ethnic_map)
|
||||
ethnic_map = []
|
||||
|
||||
# 网络请求并解析 HTML
|
||||
def fetch_page(url):
|
||||
try:
|
||||
response = scraper.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to fetch {url}: {e}")
|
||||
return None
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_page(html, ethnic):
|
||||
# 手动修复 HTML 标签
|
||||
html = html.replace('<br>', '').replace('<a ', '<a target="_blank" ')  # patch up some malformed tags before parsing
|
||||
soup = BeautifulSoup(html, 'lxml') # 使用lxml解析器
|
||||
|
||||
#soup = BeautifulSoup(html, 'html.parser')
|
||||
rows = soup.find_all('div', class_='row headshotrow')
|
||||
flag = False
|
||||
list_cnt = 0
|
||||
|
||||
for row in rows:
|
||||
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
|
||||
link_tag = col.find('a')
|
||||
img_tag = col.find('div', class_='pictag')
|
||||
flag = True
|
||||
|
||||
if link_tag and img_tag:
|
||||
href = host_url + link_tag['href']
|
||||
person = img_tag.text.strip()
|
||||
|
||||
# 将数据存储到 ethnic_map
|
||||
ethnic_map.append({
|
||||
'ethnic': ethnic,
|
||||
'person': person,
|
||||
'href': href
|
||||
})
|
||||
list_cnt = list_cnt +1
|
||||
if flag:
|
||||
logging.info(f"get {list_cnt} persons from this page. total persons: {len(ethnic_map)}")
|
||||
return soup
|
||||
else:
|
||||
return None
|
||||
|
||||
# 处理翻页
|
||||
def handle_pagination(soup, ethnic):
|
||||
next_page = soup.find('a', rel='next')
|
||||
|
||||
if next_page:
|
||||
next_url = host_url + next_page['href']
|
||||
logging.info(f"Found next page: {next_url}")
|
||||
return next_url
|
||||
else:
|
||||
logging.info(f"All pages fetched for {ethnic}.")
|
||||
return None
|
||||
|
||||
# 处理带空格的种族名
|
||||
def format_ethnic(ethnic):
|
||||
return ethnic.replace(' ', '+')
|
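# Example: format_ethnic('native american') -> 'native+american', so the request URL becomes
#   https://www.iafd.com/lookupethnic.rme/ethnic=native+american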
||||
|
||||
# 主逻辑函数:循环处理每个种族
|
||||
def process_ethnic_data():
|
||||
all_person = len(ethnic_map) # 应该为0
|
||||
all_pages = 0
|
||||
|
||||
for ethnic in ethnic_list:
|
||||
url = base_url + format_ethnic(ethnic)
|
||||
next_url = url
|
||||
cursor = int(all_person / 100)
|
||||
pages = 0
|
||||
logging.info(f"--------Fetching data for {ethnic}, url {url} ...")
|
||||
|
||||
while next_url:
|
||||
html = fetch_page(next_url)
|
||||
if html:
|
||||
soup = parse_page(html, ethnic)
|
||||
if soup:
|
||||
next_url = handle_pagination(soup, ethnic)
|
||||
pages = pages + 1
|
||||
else:
|
||||
logging.info(f"wrong html content. retring {next_url} ...")
|
||||
# 统计,并定期保存结果
|
||||
if len(ethnic_map) / 100 > cursor:
|
||||
cursor = int(len(ethnic_map) / 100)
|
||||
save_data()
|
||||
time.sleep(2) # 控制访问频率
|
||||
else:
|
||||
logging.info(f"Retrying {next_url} ...")
|
||||
time.sleep(5) # 等待后再重试
|
||||
# 统计输出
|
||||
ethnic_person = len(ethnic_map) - all_person
|
||||
all_person = len(ethnic_map)
|
||||
all_pages = all_pages + pages
|
||||
logging.info(f"--------Fetching data for {ethnic} end. total pages: {pages}, total persons: {ethnic_person}, all persons fetched: {all_person}")
|
||||
# 统计最后结果
|
||||
logging.info(f"--------Fetching all data end. total ethnic: {len(ethnic_list)}, total pages: {all_pages}, total persons: {all_person}")
|
||||
|
||||
|
||||
# 保存到文件
|
||||
def save_data():
|
||||
with open(f'{res_dir}/ethnic.json', 'w', encoding='utf-8') as json_file:
|
||||
json.dump(ethnic_map, json_file, indent=4, ensure_ascii=False)
|
||||
|
||||
with open(f'{res_dir}/ethnic.csv', 'w', newline='', encoding='utf-8') as csv_file:
|
||||
writer = csv.DictWriter(csv_file, fieldnames=['ethnic', 'person', 'href'])
|
||||
writer.writeheader()
|
||||
writer.writerows(ethnic_map)
|
||||
|
||||
# 执行主逻辑
|
||||
if __name__ == '__main__':
|
||||
process_ethnic_data()
|
||||
save_data()
|
||||
logging.info("Data fetching and saving completed.")
|
||||
120
iafd/src_json/performers_list_merge.py
Normal file
@ -0,0 +1,120 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import json
|
||||
import csv
|
||||
import os
|
||||
import argparse
|
||||
from collections import defaultdict
|
||||
|
||||
# 结果路径
|
||||
res_dir = '../result'
|
||||
|
||||
# 读取文件并返回内容
|
||||
def read_json(file_path):
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"文件 {file_path} 未找到.")
|
||||
return []
|
||||
except json.JSONDecodeError:
|
||||
print(f"文件 {file_path} 解析错误.")
|
||||
return []
|
||||
|
||||
# 处理数据,去重并合并 person 字段
|
||||
def process_data(files):
|
||||
href_map = defaultdict(list)
|
||||
|
||||
# 读取并处理每个文件
|
||||
for file in files:
|
||||
data = read_json(file['path'])
|
||||
for entry in data:
|
||||
href = entry.get('href')
|
||||
person = entry.get('person')
|
||||
if href:
|
||||
href_map[href].append(person)
|
||||
|
||||
# 合并相同 href 的 person,连接用 "|"
|
||||
result = []
|
||||
for href, persons in href_map.items():
|
||||
person = '|'.join(set(persons)) # 去重后合并
|
||||
result.append({'href': href, 'person': person})
|
||||
|
||||
return result
|
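# Worked example (hypothetical entries): if astro.json and birth.json both contain
#   {"href": "https://www.iafd.com/person.rme/id=abc", "person": "Jane Doe"}
# and ethnic.json spells the name "Jane D.", the shared href groups all three values and the
# output row becomes
#   {"href": "https://www.iafd.com/person.rme/id=abc", "person": "Jane Doe|Jane D."}
# (the order of the joined names is arbitrary because a set is used for de-duplication).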
||||
|
||||
# 保存结果到JSON文件
|
||||
def save_to_json(data, output_file):
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(data, f, ensure_ascii=False, indent=4)
|
||||
|
||||
# 保存结果到CSV文件
|
||||
def save_to_csv(data, output_file):
|
||||
with open(output_file, 'w', newline='', encoding='utf-8') as f:
|
||||
writer = csv.DictWriter(f, fieldnames=['href', 'person'])
|
||||
writer.writeheader()
|
||||
writer.writerows(data)
|
||||
|
||||
# 主函数,执行数据处理并保存
|
||||
def main():
|
||||
# 使用 argparse 获取命令行参数
|
||||
parser = argparse.ArgumentParser(description="合并多个 JSON 文件并输出到一个新的 JSON 和 CSV 文件")
|
||||
parser.add_argument('files', nargs='+', choices=['birth', 'astro', 'ethnic'],
|
||||
help="指定需要合并的文件, 至少两个, 最多三个: birth, astro, ethnic")
|
||||
args = parser.parse_args()
|
||||
|
||||
# 确保至少选择两个文件
|
||||
if len(args.files) < 2:
|
||||
print("请至少选择两个文件进行合并。")
|
||||
return
|
||||
|
||||
# 定义需要处理的文件
|
||||
file_map = {
|
||||
'birth': f'{res_dir}/birth.json',
|
||||
'astro': f'{res_dir}/astro.json',
|
||||
'ethnic': f'{res_dir}/ethnic.json'
|
||||
}
|
||||
|
||||
files = [{'path': file_map[file], 'name': file} for file in args.files]
|
||||
|
||||
# 处理数据
|
||||
processed_data = process_data(files)
|
||||
|
||||
# 根据输入的文件名生成 merged 文件名
|
||||
output_json_file = f'{res_dir}/merged_{"_".join(args.files)}.json'
|
||||
output_csv_file = f'{res_dir}/merged_{"_".join(args.files)}.csv'
|
||||
|
||||
# 确保 result 目录存在
|
||||
os.makedirs(f'{res_dir}', exist_ok=True)
|
||||
|
||||
# 输出结果到 JSON 和 CSV 文件
|
||||
save_to_json(processed_data, output_json_file)
|
||||
save_to_csv(processed_data, output_csv_file)
|
||||
|
||||
print(f"数据处理完成,结果已保存到 {output_json_file} 和 {output_csv_file}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
236
iafd/tools/data_merge.py
Normal file
@ -0,0 +1,236 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import csv
|
||||
import logging
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 输入目录和输出文件
|
||||
input_dir = 'data'
|
||||
output_json_file = f'{input_dir}/iafd_merge.json'
|
||||
output_csv_file = f'{input_dir}/iafd_merge.csv'
|
||||
output_person_txt = f'{input_dir}/all_person.txt'
|
||||
|
||||
# 读取iafd_meta.json
|
||||
try:
|
||||
with open(os.path.join(input_dir, 'iafd_meta.json'), 'r', encoding='utf-8') as file:
|
||||
iafd_data = json.load(file)
|
||||
logger.info("Loaded iafd_meta.json")
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading iafd_meta.json: {e}")
|
||||
iafd_data = []
|
||||
|
||||
# 读取stashdb.json
|
||||
try:
|
||||
with open(os.path.join(input_dir, 'stashdb.json'), 'r', encoding='utf-8') as file:
|
||||
stashdb_data = json.load(file)
|
||||
logger.info("Loaded stashdb.json")
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading stashdb.json: {e}")
|
||||
stashdb_data = []
|
||||
|
||||
# 读取javhd_meta.json
|
||||
try:
|
||||
with open(os.path.join(input_dir, 'javhd_meta.json'), 'r', encoding='utf-8') as file:
|
||||
javhd_data = json.load(file)
|
||||
logger.info("Loaded javhd_meta.json")
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading javhd_meta.json: {e}")
|
||||
javhd_data = []
|
||||
|
||||
# 读取thelordofporn_meta.json
|
||||
try:
|
||||
with open(os.path.join(input_dir, 'thelordofporn_meta.json'), 'r', encoding='utf-8') as file:
|
||||
lordporn_data = json.load(file)
|
||||
logger.info("Loaded thelordofporn_meta.json")
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading thelordofporn_meta.json: {e}")
|
||||
lordporn_data = []
|
||||
|
||||
# 构建all_meta_data,去重
|
||||
all_meta_data = set()
|
||||
|
||||
# 从各数据源提取unique的姓名数据
|
||||
for person_entry in iafd_data:
|
||||
all_meta_data.add(person_entry['person'])
|
||||
for stashdb_entry in stashdb_data:
|
||||
all_meta_data.add(stashdb_entry['name'])
|
||||
for javhd_entry in javhd_data:
|
||||
all_meta_data.add(javhd_entry['ja_name'])
|
||||
for lordporn_entry in lordporn_data:
|
||||
all_meta_data.add(lordporn_entry['pornstar'])
|
||||
|
||||
# 合并数据的列表
|
||||
merged_data = []
|
||||
|
||||
# 遍历all_meta_data,按规则合并
|
||||
for person in all_meta_data:
|
||||
# 初始化合并的数据结构体
|
||||
merged_entry = {
|
||||
'person': person
|
||||
}
|
||||
|
||||
# 初始化stashdb_entry,所有字段为空
|
||||
stashdb_entry = {
|
||||
'stashdb_gender': '',
|
||||
'stashdb_birthdate': '',
|
||||
'stashdb_ethnicity': '',
|
||||
'stashdb_country': '',
|
||||
'stashdb_height': '',
|
||||
'stashdb_measurements': '',
|
||||
'stashdb_fake_tits': '',
|
||||
'stashdb_career_length': '',
|
||||
'stashdb_aliases': ''
|
||||
}
|
||||
|
||||
# 初始化javhd_entry,所有字段为空
|
||||
javhd_entry = {
|
||||
'javhd_rank': '',
|
||||
'javhd_height': '',
|
||||
'javhd_weight': '',
|
||||
'javhd_breast_size': '',
|
||||
'javhd_breast_factor': '',
|
||||
'javhd_birth_date': '',
|
||||
'javhd_ethnicity': ''
|
||||
}
|
||||
|
||||
# 初始化lordporn_entry,所有字段为空
|
||||
lordporn_entry = {
|
||||
'lordporn_rating': '',
|
||||
'lordporn_rank': '',
|
||||
'lordporn_career_start': '',
|
||||
'lordporn_measurements': '',
|
||||
'lordporn_born': '',
|
||||
'lordporn_height': '',
|
||||
'lordporn_weight': ''
|
||||
}
|
||||
|
||||
# 初始化in_iafd字段,默认为N
|
||||
in_iafd = 'N'
|
||||
iafd_match = next((item for item in iafd_data if item.get('person') == person), None)
|
||||
if iafd_match:
|
||||
in_iafd = 'Y'
|
||||
|
||||
# 1. 检查是否存在于 stashdb 数据
|
||||
in_stashdb = 'N'
|
||||
stashdb_match = next((item for item in stashdb_data if item.get('name') == person), None)
|
||||
if stashdb_match:
|
||||
in_stashdb = 'Y'
|
||||
# 更新stashdb_entry字段
|
||||
stashdb_entry.update({
|
||||
'stashdb_gender': stashdb_match.get('gender', ''),
|
||||
'stashdb_birthdate': stashdb_match.get('birthdate', ''),
|
||||
'stashdb_ethnicity': stashdb_match.get('ethnicity', ''),
|
||||
'stashdb_country': stashdb_match.get('country', ''),
|
||||
'stashdb_height': stashdb_match.get('height', ''),
|
||||
'stashdb_measurements': stashdb_match.get('measurements', ''),
|
||||
'stashdb_fake_tits': stashdb_match.get('fake_tits', ''),
|
||||
'stashdb_career_length': stashdb_match.get('career_length', ''),
|
||||
'stashdb_aliases': stashdb_match.get('aliases', '')
|
||||
})
|
||||
|
||||
# 2. 检查是否存在于 javhd 数据
|
||||
in_javhd = 'N'
|
||||
javhd_match = next((item for item in javhd_data if item.get('ja_name') == person), None)
|
||||
if javhd_match:
|
||||
in_javhd = 'Y'
|
||||
# 更新javhd_entry字段
|
||||
javhd_entry.update({
|
||||
'javhd_rank': javhd_match.get('rank', ''),
|
||||
'javhd_height': javhd_match.get('height', ''),
|
||||
'javhd_weight': javhd_match.get('weight', ''),
|
||||
'javhd_breast_size': javhd_match.get('breast size', ''),
|
||||
'javhd_breast_factor': javhd_match.get('breast factor', ''),
|
||||
'javhd_birth_date': javhd_match.get('birth date', ''),
|
||||
'javhd_ethnicity': javhd_match.get('ethnicity', '')
|
||||
})
|
||||
|
||||
# 3. 检查是否存在于 thelordofporn 数据
|
||||
in_lordporn = 'N'
|
||||
lordporn_match = next((item for item in lordporn_data if item.get('pornstar') == person), None)
|
||||
if lordporn_match:
|
||||
in_lordporn = 'Y'
|
||||
# 更新lordporn_entry字段
|
||||
lordporn_entry.update({
|
||||
'lordporn_rating': lordporn_match.get('rating', ''),
|
||||
'lordporn_rank': lordporn_match.get('rank', ''),
|
||||
'lordporn_career_start': lordporn_match.get('career_start', ''),
|
||||
'lordporn_measurements': lordporn_match.get('measurements', ''),
|
||||
'lordporn_born': lordporn_match.get('born', ''),
|
||||
'lordporn_height': lordporn_match.get('height', ''),
|
||||
'lordporn_weight': lordporn_match.get('weight', '')
|
||||
})
|
||||
|
||||
# 添加 in_stashdb, in_javhd, in_lordporn 字段,确保都输出
|
||||
merged_entry.update({
|
||||
'in_iafd': in_iafd,
|
||||
'in_stashdb': in_stashdb,
|
||||
'in_javhd': in_javhd,
|
||||
'in_lordporn': in_lordporn
|
||||
})
|
||||
|
||||
# 将stashdb_entry, javhd_entry, lordporn_entry合并到结果中
|
||||
merged_entry.update(stashdb_entry)
|
||||
merged_entry.update(javhd_entry)
|
||||
merged_entry.update(lordporn_entry)
|
||||
|
||||
# 将合并后的条目加入到结果列表
|
||||
merged_data.append(merged_entry)
|
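# Shape of one merged row (sketch): {'person': ..., 'in_iafd': 'Y'/'N', 'in_stashdb': ...,
# 'in_javhd': ..., 'in_lordporn': ..., followed by the stashdb_*, javhd_* and lordporn_* fields}.
# Note that matching is done by exact string equality on the name, so spelling or casing
# differences between sources produce separate rows.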
||||
|
||||
# 写入iafd_merge.json
|
||||
try:
|
||||
with open(output_json_file, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(merged_data, json_file, ensure_ascii=False, indent=4)
|
||||
logger.info(f"Data successfully written to {output_json_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error writing {output_json_file}: {e}")
|
||||
|
||||
# 写入iafd_merge.csv
|
||||
try:
|
||||
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_file:
|
||||
writer = csv.DictWriter(csv_file, fieldnames=merged_data[0].keys(), delimiter='\t')
|
||||
writer.writeheader()
|
||||
writer.writerows(merged_data)
|
||||
logger.info(f"Data successfully written to {output_csv_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error writing {output_csv_file}: {e}")
|
||||
|
||||
|
||||
# 输出 all_meta_data 到 all_person.txt,并按字母顺序排序
|
||||
try:
|
||||
# 排序 all_meta_data
|
||||
all_meta_data_list = sorted(list(all_meta_data)) # 将集合转换为列表并排序
|
||||
all_meta_data_str = ','.join(all_meta_data_list) # 使用逗号连接元素
|
||||
with open(output_person_txt, 'w', encoding='utf-8') as txt_file:
|
||||
txt_file.write(all_meta_data_str)
|
||||
logger.info(f"all_meta_data successfully written to all_person.txt")
|
||||
except Exception as e:
|
||||
logger.error(f"Error writing all_person.txt: {e}")
|
||||
163
iafd/tools/iafd_scrape.py
Normal file
@ -0,0 +1,163 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
# 设置日志配置
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 预定义的 scrapers 目录
|
||||
scrapers_dir = "/root/gitlabs/stashapp_CommunityScrapers/scrapers"
|
||||
meta_file = "./data/iafd_meta.json"
|
||||
cursor_file = "./data/iafd_cursor.txt"
|
||||
output_dir = f"{scrapers_dir}/iafd_meta"
|
||||
|
||||
# 重试次数和间隔
|
||||
MAX_RETRIES = 10
|
||||
RETRY_DELAY = 5 # 5秒重试间隔
|
||||
|
||||
|
||||
# 创建输出目录
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
|
||||
def read_processed_hrefs() -> set:
|
||||
"""
|
||||
读取已经处理过的 href
|
||||
"""
|
||||
processed_hrefs = set()
|
||||
if os.path.exists(cursor_file):
|
||||
with open(cursor_file, "r", encoding="utf-8") as f:
|
||||
processed_hrefs = {line.strip().split(",")[1] for line in f if "," in line}
|
||||
return processed_hrefs
|
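# Cursor file format, one "person,href" record per processed performer, e.g. (hypothetical name):
#   Jane Doe,https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
# split(",")[1] recovers the href, which is what process_iafd_meta() checks before re-fetching.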
||||
|
||||
|
||||
def execute_scraper_command(href: str, idv: str) -> bool:
|
||||
"""
|
||||
执行命令抓取数据,成功则返回True,否则返回False。
|
||||
包含重试机制。
|
||||
"""
|
||||
command = f"cd {scrapers_dir}; python3 -m IAFD.IAFD performer {href} > {output_dir}/{idv}.json"
|
||||
attempt = 0
|
||||
while attempt < MAX_RETRIES:
|
||||
try:
|
||||
logger.info(f"执行命令: {command}")
|
||||
subprocess.run(command, shell=True, check=True)
|
||||
return True
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"执行命令失败: {e}. 重试 {attempt + 1}/{MAX_RETRIES}...")
|
||||
time.sleep(RETRY_DELAY)
|
||||
attempt += 1
|
||||
logger.error(f"命令执行失败,已尝试 {MAX_RETRIES} 次: {command}")
|
||||
return False
|
||||
|
||||
|
||||
def validate_json_file(idv: str) -> bool:
|
||||
"""
|
||||
校验 JSON 文件是否有效
|
||||
"""
|
||||
output_file = f"{output_dir}/{idv}.json"
|
||||
try:
|
||||
with open(output_file, "r", encoding="utf-8") as f:
|
||||
content = f.read().strip()
|
||||
json_data = json.loads(content) # 尝试解析 JSON
|
||||
if "name" not in json_data:
|
||||
raise ValueError("缺少 'name' 字段")
|
||||
return True
|
||||
except (json.JSONDecodeError, ValueError) as e:
|
||||
logger.error(f"解析失败,删除无效文件: {output_file}. 错误: {e}")
|
||||
os.remove(output_file)
|
||||
return False
|
||||
|
||||
|
||||
def process_iafd_meta(data: List[dict], processed_hrefs: set) -> None:
|
||||
"""
|
||||
处理 iafd_meta.json 中的数据
|
||||
"""
|
||||
for entry in data:
|
||||
person = entry.get("person")
|
||||
href = entry.get("href")
|
||||
|
||||
if not person or not href:
|
||||
logger.warning(f"跳过无效数据: {entry}")
|
||||
continue
|
||||
|
||||
# 解析 href 提取 id
|
||||
try:
|
||||
idv = href.split("id=")[-1]
|
||||
except IndexError:
|
||||
logger.error(f"无法解析 ID: {href}")
|
||||
continue
|
||||
|
||||
output_file = f"{output_dir}/{idv}.json"
|
||||
|
||||
# 跳过已处理的 href
|
||||
if href in processed_hrefs:
|
||||
logger.info(f"已处理,跳过: {person}, {href}")
|
||||
continue
|
||||
|
||||
# 执行数据抓取
|
||||
if not execute_scraper_command(href, idv):
|
||||
continue
|
||||
|
||||
# 校验 JSON 文件
|
||||
if not validate_json_file(idv):
|
||||
continue
|
||||
|
||||
# 记录已处理数据
|
||||
with open(cursor_file, "a", encoding="utf-8") as f:
|
||||
f.write(f"{person},{href}\n")
|
||||
|
||||
logger.info(f"成功处理: {person} - {href}")
|
||||
|
||||
|
||||
def main():
|
||||
"""
|
||||
主程序执行函数
|
||||
"""
|
||||
# 读取已处理的 href
|
||||
processed_hrefs = read_processed_hrefs()
|
||||
|
||||
# 读取 iafd_meta.json 数据
|
||||
try:
|
||||
with open(meta_file, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"读取 iafd_meta.json 错误: {e}")
|
||||
return
|
||||
|
||||
# 处理数据
|
||||
process_iafd_meta(data, processed_hrefs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
90
iafd/tools/stashdb_merge.py
Normal file
@ -0,0 +1,90 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import csv
|
||||
import logging
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 输入和输出目录
|
||||
input_dir = 'data/tmp' # 假设metadata目录在当前目录下
|
||||
output_json_file = 'stashdb.json'
|
||||
output_csv_file = 'stashdb.csv'
|
||||
|
||||
# 用于保存所有的条目
|
||||
data_list = []
|
||||
|
||||
# 遍历metadata文件夹,读取所有json文件
|
||||
for filename in os.listdir(input_dir):
|
||||
if filename.endswith('.json'):
|
||||
file_path = os.path.join(input_dir, filename)
|
||||
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as file:
|
||||
data = json.load(file)
|
||||
|
||||
# 提取需要的字段
|
||||
person = {
|
||||
'name': data.get('name'),
|
||||
'gender': data.get('gender'),
|
||||
'birthdate': data.get('birthdate'),
|
||||
'ethnicity': data.get('ethnicity'),
|
||||
'country': data.get('country'),
|
||||
'height': data.get('height'),
|
||||
'measurements': data.get('measurements'),
|
||||
'fake_tits': data.get('fake_tits'),
|
||||
'career_length': data.get('career_length'),
|
||||
'aliases': ', '.join(data.get('aliases', [])) # 连接aliases数组元素
|
||||
}
|
||||
|
||||
# 将数据添加到列表中
|
||||
data_list.append(person)
|
||||
logger.info(f"Processed file: {filename}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing file {filename}: {e}")
|
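# Each exported stashapp JSON file contributes one flat record, e.g. (sketch):
#   {'name': ..., 'gender': ..., 'birthdate': ..., 'ethnicity': ..., 'country': ..., 'height': ...,
#    'measurements': ..., 'fake_tits': ..., 'career_length': ..., 'aliases': 'Alias One, Alias Two'}
# where 'aliases' is the exported list joined with ', '.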
||||
|
||||
# 输出到 JSON 文件
|
||||
try:
|
||||
with open(output_json_file, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(data_list, json_file, ensure_ascii=False, indent=4)
|
||||
logger.info(f"Data successfully written to {output_json_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error writing JSON file: {e}")
|
||||
|
||||
# 输出到 CSV 文件
|
||||
try:
|
||||
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_file:
|
||||
writer = csv.DictWriter(csv_file, fieldnames=data_list[0].keys())
|
||||
writer.writeheader()
|
||||
writer.writerows(data_list)
|
||||
logger.info(f"Data successfully written to {output_csv_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error writing CSV file: {e}")
|
||||