""" Script Name: Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。 list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全 list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全 list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的 list_merge.py 上面三个列表的数据,取交集,得到整体数据。 iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配) html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。 data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并; stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并 从而获取到一份完整的数据列表。 Author: [Your Name] Created Date: YYYY-MM-DD Last Modified: YYYY-MM-DD Version: 1.0 Modification History: - YYYY-MM-DD [Your Name]: - YYYY-MM-DD [Your Name]: - YYYY-MM-DD [Your Name]: """ import requests import cloudscraper import json import time import csv from bs4 import BeautifulSoup import logging import config config.setup_logging() # 创建 cloudscraper 会话 # 设置 headers 和 scraper headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } scraper = cloudscraper.create_scraper() # 结果路径 res_dir = '../result' # 存储出生日期的映射 birth_map = [] # 设置基础URL host_url = "https://www.iafd.com" base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}" # 定义获取页面内容的函数 def fetch_page(month, day): url = base_url.format(month=month, day=day) retries = 3 while retries > 0: try: # 发送请求并获取页面 logging.info(f"Fetching URL: {url}") response = scraper.get(url) response.raise_for_status() return response.text except requests.exceptions.RequestException as e: logging.error(f"Request failed: {e}") retries -= 1 time.sleep(2) # 等待2秒后重试 return None # 解析页面内容并更新birth_map def parse_page(html, month, day): soup = BeautifulSoup(html, 'html.parser') datarows = soup.find_all('div', class_='col-sm-12 col-lg-9') if not datarows: return None flag = False list_cnt = 0 rows = datarows[0].find_all('div', class_='col-sm-4') for row in rows: link_tag = row.find('a') person = link_tag.text.strip() if link_tag else '' href = link_tag['href'] if link_tag else '' href = host_url + href # 如果 href 已经在 birth_map 中,跳过 flag = True if any(entry['href'] == href for entry in birth_map): continue # 将数据添加到 birth_map birth_map.append({ 'month': month, 'day': day, 'person': person, 'href': href }) list_cnt = list_cnt +1 if flag: logging.info(f"get {list_cnt} persons from this page. total persons: {len(birth_map)}") return soup else: return None # 循环遍历每个日期 def fetch_birthdays(): for month in range(1, 13): # 遍历1到12月 for day in range(1, 32): # 遍历1到31天 logging.info(f"Processing: Month {month}, Day {day}") while True: html = fetch_page(month, day) if html: soup = parse_page(html, month, day) if soup: # 定期保存结果 save_data() # 跳出while循环,获取下一个生日的url数据 time.sleep(2) # 控制访问频率 break else: logging.warning(f"No data. Retrying: Month {month}, Day {day}") time.sleep(3) # 等待后再重试 else: logging.warning(f"Network error. Retrying: Month {month}, Day {day}") time.sleep(3) # 等待后再重试 # 将birth_map保存到json文件 def save_data(): with open(f'{res_dir}/birth.json', 'w', encoding='utf-8') as f: json.dump(birth_map, f, ensure_ascii=False, indent=4) with open(f'{res_dir}/birth.csv', 'w', newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=['month', 'day', 'person', 'href']) writer.writeheader() for entry in birth_map: writer.writerow(entry) # 主函数 def main(): # 获取数据 fetch_birthdays() # 保存结果 save_data() if __name__ == '__main__': main()