# resources/scrapy_proj/scrapy_proj/spiders/parser/iafd_parser.py
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
#import config
#import utils
# Base URL and the parameterised URL templates
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="
studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="
ethnic_list_url = f'{host_url}/advsearch.asp'
# Request headers and the cloudscraper session
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
http_code_404 = 404
http_code_login = 401
http_code_url = 601
http_code_local = 99
save_raw_html = True
load_from_local = False
def common_parser(html, page, **kwargs):
    parser = "lxml" if page == 'ethnic' else "html.parser"
    soup = BeautifulSoup(html, parser)
    if not soup:
        return None
    if page == 'astro':
        # parse_page_astro(soup, astro)
        return parse_page_astro(soup, **kwargs)
    elif page == 'birth':
        # parse_page_birth(soup, month, day)
        return parse_page_birth(soup, **kwargs)
    elif page == 'ethnic':
        # parse_page_ethnic(soup, ethnic)
        return parse_page_ethnic(soup, **kwargs)
    elif page == 'dist':
        return parse_page_dist_stu(soup, 'distable')
    elif page == 'stu':
        return parse_page_dist_stu(soup, 'studio')
    elif page == 'actor':
        # parse_page_performer(soup, url)
        return parse_page_performer(soup, **kwargs)
    elif page == 'movies':
        # parse_page_movie(soup, href, title)
        return parse_page_movie(soup, **kwargs)
    else:
        logging.warning(f"wrong page: {page}")
        return None
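# Minimal usage sketch for the dispatcher above (illustrative only; the HTML
# string and the 'Aries' keyword argument are examples, and the kwargs must
# match the target parser's signature as hinted in the comments above):
#
#     html = scraper.get(astr_base_url + 'Aries', headers=headers).text
#     result = common_parser(html, 'astro', astro='Aries')
#     # 'astro'/'birth'/'ethnic'/'dist'/'stu' return a (list_data, next_url) tuple,
#     # while 'actor' and 'movies' return a single dict.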
'''
# Fetch a page with CloudScraper, validate it, and optionally preprocess the HTML
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    if load_from_local:  # Try to read the page from the local cache first
        html = utils.read_raw_html(url)
        if html:
            # Preprocess the HTML if a preprocessor was provided
            html_text = preprocessor(html) if preprocessor else html
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the custom page check
                return soup, http_code_local  # A code below 100 marks a local-cache hit
    for attempt in range(max_retries):
        try:
            if host_url not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, http_code_url
            response = scraper.get(url, headers=headers)
            # Handle HTTP status codes
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None, http_code_404  # Return 404 directly so the caller can skip
            response.raise_for_status()  # Raise on other HTTP errors
            # Treat expired pages the same as a 404
            if "invalid or outdated page" in response.text.lower():
                logging.debug(f"invalid or outdated page: {url}")
                return None, http_code_404  # Return 404 directly so the caller can skip
            if save_raw_html:
                utils.write_raw_html(url, response.text)
            # Preprocess the HTML if a preprocessor was provided
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the custom page check
                return soup, response.status_code
            else:
                # Check whether we were redirected, e.g. to the login page
                if response.history:
                    logging.warning(f"Page redirected on {url}. Validation failed.")
                    return None, http_code_login
                logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")
    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # Still failing after the maximum number of retries
'''
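# Minimal working substitute for the commented-out fetch_page above. This is a
# sketch, not the original implementation: it drops the local-cache and
# raw-HTML-saving paths (they depend on the commented-out `utils` import) and
# returns only the validated soup (or None), which matches how the __main__
# block at the bottom of this file calls it.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            response = scraper.get(url, headers=headers)
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None
            response.raise_for_status()
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):
                return soup
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, retrying...")
        time.sleep(2)  # back off briefly before the next attempt
    logging.error(f'Fetching failed after max retries. {url}')
    return None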
# Repair the HTML structure: drop stray <br> tags and patch <a> tags to open in
# a new tab; needed when parsing the ethnicity pages
def preprocess_html(html):
    return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False
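# Example of binding the validator with functools.partial before handing it to
# fetch_page (mirrors the call in the __main__ block below). The 'astro' div id
# is the marker used on the astrology pages; other pages would swap in their
# own tag/identifier:
#
#     astro_validator = partial(generic_validator, tag="div", identifier="astro", attr_type="id")
#     soup = fetch_page(astr_base_url + 'Leo', astro_validator)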
# Check whether the movie info table exists
def movie_validator(soup, table_id):
    return soup.find("table", id=table_id) is not None
# Parse the advanced-search page and extract the list of ethnicity options
def parse_page_ethnic_list(soup, href):
    div_root = soup.find("select", id="ethnicity1")
    if not div_root:
        logging.warning(f"Warning: No 'ethnicity1' select found in {href}")
        return None
    list_data = []
    # Extract all the <option> tags
    options = div_root.find_all('option')
    if options:
        # Parse out the value attribute and the text content
        for option in options:
            href = option.get('value', None)
            text = option.text.strip()
            if href and href.lower() == 'none':
                continue
            list_data.append({
                "name": text,
                "href": host_url + href if href else ''
            })
    return list_data
# Parse an astrology page and extract the performers listed under each birth date
def parse_page_astro(soup, astro):
    astro_div = soup.find("div", id="astro")
    if not astro_div:
        logging.warning(f"Warning: No 'astro' div found in {astro}")
        return None, None
    flag = False
    list_cnt = 0
    list_data = []
    next_url = None
    birth_date = None
    for elem in astro_div.find_all(recursive=False):
        if elem.name == "h3" and "astroday" in elem.get("class", []):
            birth_date = elem.get_text(strip=True)
        elif elem.name == "div" and "perficon" in elem.get("class", []):
            a_tag = elem.find("a")
            if a_tag:
                href = host_url + a_tag["href"]
                name = a_tag.find("span", class_="perfname")
                if name:
                    list_data.append({
                        "astrology": astro,
                        "birth_date": birth_date,
                        "person": name.get_text(strip=True),
                        "href": href
                    })
                    flag = True
                    list_cnt = list_cnt + 1
    if flag:
        logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
        return list_data, next_url
    else:
        return None, None
# Parse a birthday-calendar page and build the list of performers born that day
def parse_page_birth(soup, month, day):
    datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
    if not datarows:
        return None, None
    flag = False
    list_cnt = 0
    list_data = []
    next_url = None
    rows = datarows[0].find_all('div', class_='col-sm-4')
    for row in rows:
        link_tag = row.find('a')
        person = link_tag.text.strip() if link_tag else ''
        href = link_tag['href'] if link_tag else ''
        href = host_url + href
        # Skip this entry if the href has already been collected
        flag = True
        if any(entry['href'] == href for entry in list_data):
            continue
        # Append the record
        list_data.append({
            'month': month,
            'day': day,
            'person': person,
            'href': href
        })
        list_cnt = list_cnt + 1
    if flag:
        logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
        return list_data, next_url
    else:
        return None, None
# Parse an ethnicity search-result page and extract the performers on it
def parse_page_ethnic(soup, ethnic):
    rows = soup.find_all('div', class_='row headshotrow')
    flag = False
    list_data = []
    next_url = None
    for row in rows:
        for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
            link_tag = col.find('a')
            img_tag = col.find('div', class_='pictag')
            flag = True
            if link_tag and img_tag:
                href = host_url + link_tag['href']
                person = img_tag.text.strip()
                # Store the record
                list_data.append({
                    'ethnic': ethnic,
                    'person': person,
                    'href': href
                })
    if flag:
        logging.debug(f"get {len(list_data)} persons from this page.")
        next_page = soup.find('a', rel='next')
        if next_page:
            next_url = host_url + next_page['href']
            logging.debug(f"Found next page: {next_url}")
            return list_data, next_url
        else:
            logging.debug(f"All pages fetched for {ethnic}.")
            return list_data, None
    else:
        return None, None
# Parse the distributor/studio list page (the <select> dropdown of all entries)
def parse_page_dist_stu_list(soup, select_name):
    list_data = []
    next_url = None
    select_element = soup.find('select', {'name': select_name})
    if select_element:
        options = select_element.find_all('option')
        for option in options:
            value = option.get('value')  # value attribute
            text = option.text.strip()   # text content
            list_data.append({
                'name': text,
                'href': str(value)
            })
        return list_data, next_url
    else:
        return None, None
# Parse a distributor/studio page and extract its list of titles
def parse_page_dist_stu(soup, table_id):
    table = soup.find("table", id=table_id)
    if not table:
        logging.warning(f"Warning: No {table_id} table found ")
        return None, None
    # Find the thead and drop it; it does not need parsing
    thead = table.find('thead')
    if thead:
        thead.decompose()
    # Only the tbody remains now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []
    list_data = []
    next_url = None
    for row in rows:
        cols = row.find_all('td')
        if len(cols) >= 5:
            title = cols[0].text.strip()
            label = cols[1].text.strip()
            year = cols[2].text.strip()
            rev = cols[3].text.strip()
            a_href = cols[0].find('a')
            href = host_url + a_href['href'] if a_href else ''
            list_data.append({
                'title': title,
                'label': label,
                'year': year,
                'rev': rev,
                'href': href
            })
    return list_data, next_url
# Parse a credits table (covers both performer and directorial credits)
def parse_credits_table(table, distributor_list):
    # Find the thead and drop it; it does not need parsing
    thead = table.find('thead')
    if thead:
        thead.decompose()
    # Only the tbody remains now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []
    movies = []
    distributor_count = {key: 0 for key in distributor_list}  # initialise a counter per distributor
    # rows = table.find_all('tr', class_='we')
    for row in rows:
        # tr_class = row.get('class', '')
        tr_class = ' '.join(row.get('class', []))  # class attribute as a string, empty if absent
        cols = row.find_all('td')
        if len(cols) >= 6:
            title = cols[0].text.strip()
            href_a = cols[0].find('a')
            href = href_a['href'] if href_a else ''
            year = cols[1].text.strip()
            distributor = cols[2].text.strip().lower()
            href_d = cols[2].find('a')
            href_dist = host_url + href_d['href'] if href_d else ''
            notes = cols[3].text.strip()
            rev = cols[4].text.strip()
            formats = cols[5].text.strip()
            for key in distributor_list:
                if key in distributor:
                    distributor_count[key] += 1
            movies.append({
                'title': title,
                'href': href,
                'year': year,
                'distributor': distributor,
                'distributor_href': href_dist,
                'notes': notes,
                'rev': rev,
                'formats': formats,
                'tr_class': tr_class
            })
    return movies, distributor_count
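# Shape of the return value, shown with hypothetical data for a single credit
# row whose distributor column contains "vixen" (all values are illustrative):
#
#     movies, counts = parse_credits_table(table, ['vixen', 'blacked', 'tushy', 'x-art'])
#     # movies -> [{'title': '...', 'href': '...', 'year': '2021', 'distributor': 'vixen',
#     #             'distributor_href': '...', 'notes': '', 'rev': '', 'formats': '', 'tr_class': 'we'}]
#     # counts -> {'vixen': 1, 'blacked': 0, 'tushy': 0, 'x-art': 0}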
# Parse a performer page and extract the required data
def parse_page_performer(soup, url):
    # Extracted data
    data = {}
    # Field names we need and the corresponding headings in the HTML
    fields = {
        'performer_aka': 'Performer AKA',
        'birthday': 'Birthday',
        'astrology': 'Astrology',
        'birthplace': 'Birthplace',
        'gender': 'Gender',
        'years_active': 'Years Active',
        'ethnicity': 'Ethnicity',
        'nationality': 'Nationality',
        'hair_colors': 'Hair Colors',
        'eye_color': 'Eye Color',
        'height': 'Height',
        'weight': 'Weight',
        'measurements': 'Measurements',
        'tattoos': 'Tattoos',
        'piercings': 'Piercings'
    }
    reversed_map = {v: k for k, v in fields.items()}
    # Parse the credits tables: one for performer credits, one for directorial credits
    role_list = ['personal', 'directoral']
    distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
    credits_list = {}
    # Use a dict to keep the per-distributor totals
    distributor_count = {key: 0 for key in distributor_list}
    for role in role_list:
        table = soup.find('table', id=role)
        if table:
            movies, stat_map = parse_credits_table(table, distributor_list)
            credits_list[role] = movies
            # Merge the distributor counts
            for distributor in distributor_list:
                distributor_count[distributor] += stat_map.get(distributor, 0)
    # Count the total number of movies
    # movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
    movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
    # Warn if no credits table was found at all
    if len(credits_list) == 0:
        logging.warning(f"movie table empty. url: {url} ")
    # Walk every bioheading to collect the metadata fields
    bioheadings = soup.find_all('p', class_='bioheading')
    for bio in bioheadings:
        heading = bio.text.strip()
        biodata = None
        # Headings containing "Performer" need special handling (AKA list)
        if 'Performer' in heading:
            heading = 'Performer AKA'
            biodata_div = bio.find_next('div', class_='biodata')
            if biodata_div:
                div_text = biodata_div.get_text(separator='|').strip()
                biodata = [b.strip() for b in div_text.split('|') if b.strip()]
        else:
            biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
        # Store the value under our field name
        if heading in reversed_map:
            kkey = reversed_map[heading]
            data[kkey] = biodata
    # Attach the statistics to the result
    data['movies_cnt'] = movies_cnt
    data['vixen_cnt'] = distributor_count['vixen']
    data['blacked_cnt'] = distributor_count['blacked']
    data['tushy_cnt'] = distributor_count['tushy']
    data['x_art_cnt'] = distributor_count['x-art']
    data['credits'] = credits_list
    return data
# Parse a movie page and extract the movie information
def parse_page_movie(soup, href, title):
    # Parse the basic movie metadata
    movie_data = {}
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            if key == "Directors":  # handle the multiple-directors case
                directors = []
                links = value.find_all("a")
                for link in links:
                    director_name = link.text.strip()
                    director_href = host_url + link['href'] if link.get('href') else ''
                    directors.append({"name": director_name, "href": director_href})
                movie_data[key] = directors
            else:
                val = value.text.strip()
                if key in ["Distributor", "Studio", "Director"]:
                    link = value.find("a")
                    if link:
                        val = link.text.strip()
                        movie_data[f'{key}Href'] = host_url + link['href']
                movie_data[key] = val
    else:
        return None
    # Parse the cast information
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]
            # performer["tags"] = [
            #     tag.strip() for br in cast.find_all("br")
            #     if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
            # ]
            tags = []
            for br in cast.find_all("br"):
                tag = br.next_sibling
                if isinstance(tag, str) and tag.strip():
                    tags.append(tag.strip())
            performer["tags"] = tags
            # performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
            performers.append(performer)
    # Parse the scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()  # scene number
                performer_info = cols[1]      # performers plus link info
                # Keep the complete HTML before the first <br> (preserving <i> tags etc.)
                performer_html = str(performer_info)
                # BeautifulSoup may serialise the tag as <br/> or <br>, so split on either
                performers_html = re.split(r"<br\s*/?>", performer_html, maxsplit=1)[0].strip()
                # Strip the HTML tags and keep only the text content
                performers_soup = BeautifulSoup(performers_html, "html.parser")
                performers_text = performers_soup.get_text()
                # Extract the individual performers
                scene_performers = [p.strip() for p in performers_text.split(",")]
                # Try to pick up the `webscene` and `studio` links
                links_data = {}
                links = performer_info.find_all("a")
                if links:
                    webscene_title = links[0].text.strip() if len(links) > 0 else None
                    webscene = links[0]["href"] if len(links) > 0 else None
                    studio = links[1].text.strip() if len(links) > 1 else None
                    studio_lnk = links[1]["href"] if len(links) > 1 else None
                    links_data = {
                        "title": webscene_title,
                        "webscene": webscene,
                        "studio": studio,
                        "studio_lnk": studio_lnk,
                    }
                scene_data = {
                    "scene": scene,
                    "performers": scene_performers,
                    **links_data,
                }
                scene_breakdowns.append(scene_data)
    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": movie_data.get("DirectorHref", ""),
        "DistributorHref": movie_data.get("DistributorHref", ""),
        "StudioHref": movie_data.get("StudioHref", ""),
        "Directors": movie_data.get("Directors", []),  # present only when multiple directors were listed
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }
if __name__ == "__main__":
    for astro in astro_list:
        url = astr_base_url + astro
        next_url = url
        logging.info(f"Fetching data for {astro}, url {url} ...")
        while True:
            soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
            if soup:
                list_data, next_url = parse_page_astro(soup, astro)
                if list_data:
                    print(list_data[0] if len(list_data) > 0 else 'no data')
                break
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying
        time.sleep(2)  # throttle the request rate