This repository has been archived on 2026-01-07. You can view files and clone it, but cannot push or open issues or pull requests.
Files
resources/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
2025-07-03 16:07:47 +08:00

259 lines
12 KiB
Python

import scrapy
import re
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
from scrapy_proj.db_wapper.iafd_query import IAFDQuery
# Shared IAFDQuery instance, created once at import time. The spider calls
# its get_performers() / get_movies() methods to load the rows that should be
# (re)crawled as detail pages.
db_tools = IAFDQuery()
class IAFDSpider(scrapy.Spider):
    """Spider that collects performer and movie listings from iafd.com.

    The crawl stages to run are selected with the ``cmd`` spider argument, a
    comma-separated list of command words (``astro``, ``birth``, ``ethnic``,
    ``dist``, ``stu``, ``performers``, ``movies``). When ``cmd`` is empty,
    every stage runs.
    """

    name = "iafd"
    allowed_domains = ["iafd.com"]

    host_url = "https://www.iafd.com"
    astr_base_url = f"{host_url}/astrology.rme/sign="
    astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
    birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
    distributors_list_url = f'{host_url}/distrib.asp'
    studios_list_url = f"{host_url}/studio.asp"
    ethnic_list_url = f'{host_url}/advsearch.asp'

    # Largest valid day for each calendar month (February counted as 29 so
    # leap-day birthdays are still requested).
    _days_in_month = (31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)

    # Source flags carried by every IAFDPersonItem / IAFDMovieItem.
    _person_source_flags = ('from_astro_list', 'from_birth_list', 'from_ethnic_list', 'from_movie_list')
    _movie_source_flags = ('from_performer_list', 'from_dist_list', 'from_stu_list')

    # Detail-page fields read from a ``span.<field>`` element of the same name.
    # NOTE(review): these selectors are placeholders in the original code
    # ("adjust to the real page structure") -- confirm against the live page.
    _person_span_fields = (
        'gender', 'birthday', 'astrology', 'birthplace', 'years_active',
        'ethnicity', 'nationality', 'hair_colors', 'eye_color', 'height',
        'weight', 'measurements', 'tattoos', 'piercings', 'movies_cnt',
        'vixen_cnt', 'blacked_cnt', 'tushy_cnt', 'x_art_cnt',
    )

    def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
        """Store the spider arguments and work out which stages to run.

        Args:
            debug: ``'true'`` or ``'1'`` (case-insensitive) enables debug
                mode, which limits how much each stage crawls.
            cmd: Comma-separated command words; empty means "all stages".
            update: Integer-like value; ``0`` restricts the database queries
                to rows with ``is_full_data == 0``.
        """
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.cmd_str = cmd
        self.update = int(update)
        self.logger.info(f"debug mod: {self.debug}, cmd: {self.cmd_str}, update: {self.update}")

        self.cmd_astro = 'astro'
        self.cmd_birth = 'birth'
        self.cmd_ethnic = 'ethnic'
        self.cmd_dist = 'dist'
        self.cmd_stu = 'stu'
        self.cmd_performers = 'performers'
        self.cmd_movies = 'movies'

        # ''.split(',') returns [''] rather than [], so blank words have to be
        # filtered out before checking whether any command was given.
        words = (word.strip() for word in str(self.cmd_str or '').split(','))
        self.cmd_list = [word for word in words if word]
        if not self.cmd_list:
            self.cmd_list = [self.cmd_astro, self.cmd_birth, self.cmd_ethnic, self.cmd_dist, self.cmd_stu, self.cmd_performers, self.cmd_movies]

    def start_requests(self):
        """Yield the initial requests for every stage named in ``cmd_list``."""
        # Performer lists by astrological sign.
        if self.cmd_astro in self.cmd_list:
            # start_astro() is a generator; it must be delegated to, not
            # merely called, for its requests to be scheduled.
            yield from self.start_astro()

        # Performer lists by birthday.
        if self.cmd_birth in self.cmd_list:
            yield from self.start_birth()

        # Ethnicity list.
        if self.cmd_ethnic in self.cmd_list:
            yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)

        # Distributors list.
        if self.cmd_dist in self.cmd_list:
            yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)

        # Studios list.
        if self.cmd_stu in self.cmd_list:
            yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)

        query_args = {}
        if self.debug:
            query_args['limit'] = 5
        if self.update == 0:
            query_args['is_full_data'] = 0

        # Performers waiting for a detail crawl.
        if self.cmd_performers in self.cmd_list:
            for item in db_tools.get_performers(**query_args) or []:
                href = item.get('href', '')
                movies_cnt = item.get('movies_cnt') or 0
                self.logger.info(f"fetch from db. item: {item}")
                if not href:
                    # scrapy.Request raises on an empty URL, which would
                    # abort the whole start iterator.
                    self.logger.warning(f"skip performer without href: {item}")
                    continue
                yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})

        # Movies waiting for a detail crawl.
        if self.cmd_movies in self.cmd_list:
            for item in db_tools.get_movies(**query_args) or []:
                href = item.get('href', '')
                self.logger.info(f"fetch from db. item: {item}")
                if not href:
                    self.logger.warning(f"skip movie without href: {item}")
                    continue
                yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})

    def start_astro(self):
        """Yield one request per astrological sign (only the first in debug mode)."""
        for astro in self.astro_list:
            url = self.astr_base_url + astro
            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
            if self.debug:
                break

    def start_birth(self):
        """Yield one request per calendar day.

        In debug mode only the first day of each month is requested.
        """
        for month in range(1, 13):
            # Bound the day by the month length so impossible dates such as
            # February 30 are never requested.
            for day in range(1, self._days_in_month[month - 1] + 1):
                url = self.birth_base_url.format(month=month, day=day)
                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
                if self.debug:
                    break

    async def start(self):
        """Async entry point; delegates to the base class ``start()``.

        NOTE(review): this relies on the base ``scrapy.Spider.start()``
        iterating ``start_requests()`` -- confirm for the Scrapy version in use.
        """
        async for request in super().start():
            yield request

    def _make_person_item(self, name, href, source_flag):
        """Build an IAFDPersonItem with ``source_flag`` set to 1 and the other flags to 0."""
        item = IAFDPersonItem()
        item['name'] = name
        item['href'] = href
        for flag in self._person_source_flags:
            item[flag] = 1 if flag == source_flag else 0
        return item

    def _make_movie_item(self, title, href, source_flag):
        """Build an IAFDMovieItem with ``source_flag`` set to 1 and the other flags to 0."""
        item = IAFDMovieItem()
        item['title'] = title
        item['href'] = href
        item['release_year'] = 0
        for flag in self._movie_source_flags:
            item[flag] = 1 if flag == source_flag else 0
        return item

    def parse_astro_page(self, response):
        """Yield an IAFDPersonItem for each named performer tile on a sign page."""
        # Select the performer tiles directly instead of walking every
        # descendant element, which could emit the same performer repeatedly
        # through wrapper elements.
        for tile in response.css('div#astro').css('div.perficon'):
            a_tag = tile.css('a')
            link = a_tag.attrib.get('href') if a_tag else None
            if not link:
                continue
            name = a_tag.css('span.perfname::text').get()
            if name:
                yield self._make_person_item(name, self.host_url + link, 'from_astro_list')

    def parse_birth_page(self, response):
        """Yield an IAFDPersonItem for each performer link on a birthday page."""
        datarows = response.css('div.col-sm-12.col-lg-9')
        if not datarows:
            return
        for row in datarows[0].css('div.col-sm-4'):
            link_tag = row.css('a')
            link = link_tag.attrib.get('href') if link_tag else None
            if not link:
                # A cell without a link carries no performer to record.
                continue
            person = (link_tag.css('::text').get() or '').strip()
            yield self._make_person_item(person, self.host_url + link, 'from_birth_list')

    def parse_ethnic_list_page(self, response):
        """Yield a request for each ethnicity option of the search form."""
        for option in response.css('select#ethnicity1').css('option'):
            href = option.attrib.get('value')
            if not href or href.lower() == 'none':
                continue
            text = (option.css('::text').get() or '').strip()
            yield scrapy.Request(self.host_url + href, callback=self.parse_ethnic_page, meta={'ethnic': text})
            if self.debug:
                break

    def parse_ethnic_page(self, response):
        """Yield performer items from an ethnicity result page and follow the next page."""
        ethnic = response.meta['ethnic']
        for row in response.css('div.row.headshotrow'):
            for col in row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6'):
                link_tag = col.css('a')
                img_tag = col.css('div.pictag')
                if not (link_tag and img_tag):
                    continue
                link = link_tag.attrib.get('href')
                if not link:
                    continue
                person = (img_tag.css('::text').get() or '').strip()
                yield self._make_person_item(person, self.host_url + link, 'from_ethnic_list')

        next_page = response.css('a[rel="next"]')
        next_link = next_page.attrib.get('href') if next_page else None
        if next_link:
            yield scrapy.Request(self.host_url + next_link, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})

    def _parse_company_options(self, response, select_name, url_path, source_flag):
        """Yield an item and a follow-up request for each option of a ``<select>``.

        Args:
            response: The list page response.
            select_name: ``name`` attribute of the ``<select>`` element.
            url_path: Path prefix that the option value is appended to.
            source_flag: Movie-item flag to set to 1.
        """
        for option in response.css(f'select[name="{select_name}"]').css('option'):
            value = option.attrib.get('value')
            if not value:
                # Options without a value would otherwise produce a
                # meaningless ".../xxx=None" URL.
                continue
            text = (option.css('::text').get() or '').strip()
            url = f"{self.host_url}{url_path}{value}"
            yield self._make_movie_item(text, url, source_flag)
            yield scrapy.Request(url, callback=self.parse_movie_detail_page)

    def parse_distributors_list_page(self, response):
        """Yield an item and a follow-up request for every distributor option."""
        yield from self._parse_company_options(response, 'Distrib', '/distrib.rme/distrib=', 'from_dist_list')

    def parse_studios_list_page(self, response):
        """Yield an item and a follow-up request for every studio option."""
        yield from self._parse_company_options(response, 'Studio', '/studio.rme/studio=', 'from_stu_list')

    def parse_person_detail_page(self, response):
        """Yield an IAFDPersonDetailItem populated from a performer page."""
        item = IAFDPersonDetailItem()
        item['href'] = response.url
        # Assumes the performer name is in the <h1> element.
        item['person'] = response.css('h1::text').get()
        # Remaining details; adjust the selectors to the real page structure.
        for field in self._person_span_fields:
            item[field] = response.css(f'span.{field}::text').get()
        item['performer_aka'] = response.css('span.performer_aka::text').getall()
        yield item

    def parse_movie_detail_page(self, response):
        """Yield an IAFDMovieDetailItem populated from a movie page."""
        item = IAFDMovieDetailItem()
        # Assumes the title is in the <h1> element.
        item['title'] = response.css('h1::text').get()
        item['href'] = response.url
        # Remaining details are not parsed yet; extend per the page structure.
        yield item