import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial

import config

# Base URLs and the variable parameters used to build them
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra',
              'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
ethnic_url = f"{host_url}/lookupethnic.rme/ethnic="
ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american',
               'middle eastern', 'mediteranean', 'indian', 'polynesian',
               'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african',
               'south asian']
distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="
studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="

# Request headers and the CloudScraper session
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()


# Fetch a page with CloudScraper and validate it; supports a custom parser and
# an optional HTML preprocessor.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            if host_url not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None
            response = scraper.get(url, headers=headers)
            response.raise_for_status()  # raise on HTTP errors

            # Preprocess the HTML if a preprocessor was supplied
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # caller-supplied page check
                return soup
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")
    logging.error(f'Fetching failed after max retries. {url}')
    return None  # still failing after the maximum number of retries


# Fix the HTML structure by stripping extra tags and repairing malformed ones;
# needed when fetching the ethnicity pages.
def preprocess_html(html):
    # Assumed replacements: the exact tag strings were unreadable in the source,
    # so stray <br> variants are normalized here and may need adjusting.
    return html.replace('<br>', '').replace('<br/>', '')
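

# Illustrative sketch (not part of the original module): a minimal validator that
# fetch_page can be handed, plus how a call would look. The helper name
# _has_table and the example URL built from astro_list are assumptions for
# demonstration only.
#
#     soup = fetch_page(astr_base_url + astro_list[0], _has_table,
#                       preprocessor=preprocess_html)
def _has_table(soup):
    """Return True if the parsed page contains at least one <table> element."""
    return soup.find("table") is not None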