# resources/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
import scrapy
import re
import sys
from urllib.parse import urljoin, quote_plus
from scrapy_proj.spiders.base_spider import BaseSpider
from scrapy_proj.items import IafdDistributorsItem, IafdMetaEthnicItem, IafdMoviesItem, IafdPerformersItem, IafdStudiosItem
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
from scrapy_proj.spiders.parser.iafd_parser import common_parser
from scrapy_proj.utils.utils import pretty_json_simple, is_valid_url
db_tools = IAFDDBHandler()
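
# Usage sketch (assumed invocation; the real spider name is the value of
# SPIDER_NAME_IAFD defined in scrapy_proj.comm.comm_def):
#   scrapy crawl <spider-name> -a cmd=astro,birth -a debug=true -a mod=update
# cmd picks which sub-crawls run (comma-separated; defaults to all),
# debug limits fetch volume, and mod=update restricts DB reads to
# incomplete rows (is_full_data in [0, 404]).
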
class IAFDSpider(BaseSpider):
    name = SPIDER_NAME_IAFD
    allowed_domains = ["iafd.com", "www.iafd.com"]
    host_url = "https://www.iafd.com"
    astr_base_url = f"{host_url}/astrology.rme/sign="
    astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
    birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
    distributors_list_url = f'{host_url}/distrib.asp'
    studios_list_url = f"{host_url}/studio.asp"
    ethnic_list_url = f'{host_url}/advsearch.asp'

    def __init__(self, debug='false', cmd='', mod='all', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.update_mode = bool(mod) and mod.lower() == 'update'
        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
        self.cmd_astro = 'astro'
        self.cmd_birth = 'birth'
        self.cmd_ethnic = 'ethnic'
        self.cmd_dist = 'dist'
        self.cmd_stu = 'stu'
        self.cmd_performers = 'performers'
        self.cmd_movies = 'movies'
        self.cmd_list = [self.cmd_astro, self.cmd_birth, self.cmd_ethnic, self.cmd_dist, self.cmd_stu, self.cmd_performers, self.cmd_movies]
        if cmd:
            self.cmd_list = cmd.split(',')
        self.existed_actors = {}
        self.existed_movies = {}
        self.load_existed_actors()
        self.load_existed_movies()
        self.requested_url = set()

    # Entry point; invoked by the base class.
    def custom_start_requests(self):
        self.crawler.stats.set_value(f"{self.name}/actor_all", 0)
        self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
        self.crawler.stats.set_value(f"{self.name}/movie_all", 0)
        self.crawler.stats.set_value(f"{self.name}/movie_done", 0)
        self.crawler.stats.set_value(f"{self.name}/ethnic_pages", 0)
        self.crawler.stats.set_value(f"{self.name}/dist_pages", 0)
        self.crawler.stats.set_value(f"{self.name}/stu_pages", 0)
        self.crawler.stats.set_value(f"{self.name}/4xx_cnt", 0)
        self.crawler.stats.set_value(f"{self.name}/5xx_cnt", 0)
        self.crawler.stats.set_value(f"{self.name}/other_cnt", 0)
        # Dispatch according to the requested commands.
        if self.cmd_astro in self.cmd_list:
            # Key point: iterate the generator produced by start_astro and
            # forward its Requests to the framework.
            for req in self.start_astro():
                yield req
        # Fetch performer lists by birthday.
        if self.cmd_birth in self.cmd_list:
            for req in self.start_birth():
                yield req
        # Fetch the ethnicity list.
        if self.cmd_ethnic in self.cmd_list:
            yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)
        # Fetch the distributors list.
        if self.cmd_dist in self.cmd_list:
            yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)
        # Fetch the studios list.
        if self.cmd_stu in self.cmd_list:
            yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)
        query_args = {}
        if self.debug:
            query_args['limit'] = 5
        if self.update_mode:
            query_args['is_full_data__in'] = [0, 404]
        # Read the list of performers pending update from the DB.
        if self.cmd_performers in self.cmd_list:
            actors = db_tools.get_performers(**query_args)
            self.crawler.stats.set_value(f"{self.name}/actor_all", len(actors) if actors else 0)
            self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
            if actors:
                for item in actors:
                    href = item.get('href', '')
                    movies_cnt = item.get('movies_cnt') or 0
                    self.logger.info(f"fetch from db. item: {item}")
                    yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt, 'item_type': 'actor'})
        # Read the list of movies pending update from the DB.
        if self.cmd_movies in self.cmd_list:
            movies = db_tools.get_movies(**query_args)
            # Keys must match the counters initialized above and incremented in
            # the detail callbacks ("movie_all"/"movie_done", not "movies_*").
            self.crawler.stats.set_value(f"{self.name}/movie_all", len(movies) if movies else 0)
            self.crawler.stats.set_value(f"{self.name}/movie_done", 0)
            if movies:
                for item in movies:
                    href = item.get('href', '')
                    self.logger.info(f"fetch from db. item: {item}")
                    yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', ''), 'item_type': 'movie'})
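
    # Note: the list commands (astro/birth/ethnic/dist/stu) discover new
    # records from the site, while 'performers'/'movies' re-crawl rows already
    # stored in the DB (optionally restricted to incomplete rows in update
    # mode via is_full_data__in=[0, 404]).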

    def start_astro(self):
        # Fetch performer lists by zodiac sign.
        for astro in self.astro_list:
            url = self.astr_base_url + astro
            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})

    def start_birth(self):
        for month in range(1, 13):
            for day in range(1, 32):
                url = self.birth_base_url.format(month=month, day=day)
                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
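
    # Note: the loop deliberately emits impossible dates such as 2/30; the
    # calendar endpoint is assumed to answer these with an empty or invalid
    # page, which parse_birth_page routes to _handle_invalid_response.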

    async def start(self):
        # Delegate to the base class's start flow (the original
        # start_requests path).
        async for request in super().start():
            yield request
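
    # This override exists because newer Scrapy versions (2.13+) use the
    # asynchronous start() coroutine as the entry point; delegating to
    # super().start() keeps BaseSpider's custom_start_requests hook in charge.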

    # Parse the listing page, then request details.
    def parse_astro_page(self, response):
        astro = response.meta.get('astro', '')
        data, next_url = common_parser(html=response.text, page='astro', astro=astro)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
            for item in data:
                yield from self._create_performer_request(href=item['href'], name=item['person'])
        else:
            # _handle_invalid_response is a generator, so it must be iterated
            # with `yield from` for its logging and fallback items to take effect.
            yield from self._handle_invalid_response(response, page='astro')

    # Parse the listing page, then request details.
    def parse_birth_page(self, response):
        month = response.meta['month']
        day = response.meta['day']
        data, next_url = common_parser(html=response.text, page='birth', month=month, day=day)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
            for item in data:
                yield from self._create_performer_request(href=item['href'], name=item['person'])
        else:
            yield from self._handle_invalid_response(response, page='birth')

    # Parse the listing page, then request details.
    def parse_ethnic_list_page(self, response):
        div_root = response.css('select#ethnicity1')
        if div_root:
            options = div_root.css('option')
            for option in options:
                href = option.attrib.get('value')
                text = (option.css('::text').get() or '').strip()
                if href and href.lower() != 'none':
                    ethnic_url = urljoin(response.url, href)
                    self.logger.info(f"ethnic: ({text}), start url: {ethnic_url}")
                    item = IafdMetaEthnicItem()
                    item['name'] = text
                    item['href'] = ethnic_url
                    yield item
                    yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text, 'depth': 1})
        else:
            yield from self._handle_invalid_response(response, page='ethnic_list')

    # Parse the listing page, then request details.
    def parse_ethnic_page(self, response):
        ethnic = response.meta['ethnic']
        depth = response.meta.get('depth', 1)
        if self.debug and depth >= 3:
            self.logger.debug(f"debug mode, stop next page. ethnic: {ethnic}, url: {response.url}")
            return
        data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
            self.crawler.stats.inc_value(f"{self.name}/ethnic_pages")
            for item in data:
                yield from self._create_performer_request(href=item['href'], name=item['person'])
            if next_url:
                yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic, 'depth': depth + 1})
            else:
                self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
        else:
            yield from self._handle_invalid_response(response, page='ethnic')

    def parse_distributors_list_page(self, response):
        select_element = response.css('select[name="Distrib"]')
        if select_element:
            options = select_element.css('option')
            for option in options:
                value = option.attrib.get('value')
                text = (option.css('::text').get() or '').strip()
                dis_url = f"{self.host_url}/distrib.rme/distrib={value}"
                item = IafdDistributorsItem()
                item['name'] = text
                item['href'] = dis_url
                yield item
                yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
        else:
            yield from self._handle_invalid_response(response, page='dist_list')

    def parse_studios_list_page(self, response):
        select_element = response.css('select[name="Studio"]')
        if select_element:
            options = select_element.css('option')
            for option in options:
                value = option.attrib.get('value')
                text = (option.css('::text').get() or '').strip()
                stu_url = f"{self.host_url}/studio.rme/studio={value}"
                item = IafdStudiosItem()
                item['name'] = text
                item['href'] = stu_url
                yield item
                yield scrapy.Request(stu_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
        else:
            yield from self._handle_invalid_response(response, page='stu_list')

    def parse_stu_dist_page(self, response):
        list_type = response.meta.get('list_type', '')
        data, next_url = common_parser(html=response.text, page=list_type)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
            self.crawler.stats.inc_value(f"{self.name}/{list_type}_pages")
            for movie in data:
                yield from self._create_movie_request(href=movie['href'], title=movie['title'])
        else:
            yield from self._handle_invalid_response(response, page='dist_stu')

    # Centralized creation of performer detail requests.
    def _create_performer_request(self, href, name):
        if href == '':
            return
        if is_valid_url(href):
            if self._can_request(href):
                self.crawler.stats.inc_value(f"{self.name}/actor_all")
                # item_type is 'actor' here, matching the other request to
                # parse_person_detail_page in custom_start_requests.
                yield scrapy.Request(href,
                                     callback=self.parse_person_detail_page,
                                     meta={'name': name, 'item_type': 'actor'}
                                     )
        else:
            self.logger.warning(f"wrong url. {href}, ignore...")

    # Centralized creation of movie detail requests.
    def _create_movie_request(self, href, title):
        if href == '':
            return
        if is_valid_url(href):
            if self.need_update_movie(href) and self._can_request(href):
                self.crawler.stats.inc_value(f"{self.name}/movie_all")
                yield scrapy.Request(href,
                                     callback=self.parse_movie_detail_page,
                                     meta={'title': title, 'item_type': 'movie', 'cache': True}
                                     )
        else:
            self.logger.warning(f"wrong url. {href}, ignore...")

    # Parse and process a performer detail page.
    def parse_person_detail_page(self, response):
        data = common_parser(html=response.text, page='actor', url=response.url)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data: {data}")
            self.crawler.stats.inc_value(f"{self.name}/actor_done")
            item = IafdPerformersItem()
            item['name'] = response.meta.get('name', '')
            item['href'] = response.url
            item['is_full_data'] = 1
            for k, v in data.items():
                if k in item.fields:
                    item[k] = v
            yield item
            # Queue the performer's movie credits.
            for role, movies in data.get('credits', {}).items():
                if movies:
                    for movie in movies:
                        yield from self._create_movie_request(href=movie['href'], title=movie['title'])
        else:
            yield from self._handle_invalid_response(response, page='actor')

    # Parse and process a movie detail page.
    def parse_movie_detail_page(self, response):
        title = response.meta.get('title', '')
        data = common_parser(html=response.text, page='movies', href=response.url, title=title)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data: {data}")
            self.crawler.stats.inc_value(f"{self.name}/movie_done")
            item = IafdMoviesItem()
            item['is_full_data'] = 1
            for k, v in data.items():
                if k in item.fields:
                    item[k] = v
            yield item
            # Follow the various related links.
            link_url = data.get('DistributorHref', '')
            if is_valid_url(link_url) and self._can_request(link_url):
                yield scrapy.Request(link_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
            link_url = data.get('StudioHref', '')
            if is_valid_url(link_url) and self._can_request(link_url):
                yield scrapy.Request(link_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
            link_url = data.get('DirectorHref', '')
            yield from self._create_performer_request(href=link_url, name=data.get('Director'))
            for director in data.get('Directors', []):
                yield from self._create_performer_request(href=director['href'], name=director['name'])
        else:
            yield from self._handle_invalid_response(response, page='movie')

    # Centralized detection and handling of abnormal responses.
    def _handle_invalid_response(self, response, page=None):
        if response.status == 200:
            if "invalid or outdated page" in response.text.lower():
                self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
                self.crawler.stats.inc_value(f"{self.name}/4xx_cnt")
            else:
                self.logger.warning(f"unknown page. url: {response.url}, content: {response.text[:500]}")
                self.crawler.stats.inc_value(f"{self.name}/other_cnt")
        elif response.status in [404, 403]:
            self.logger.warning(f"got 4xx page. url: {response.url}, status: {response.status}")
            self.crawler.stats.inc_value(f"{self.name}/4xx_cnt")
        elif response.status in [500, 502, 503, 504, 521, 522, 524]:
            self.logger.error(f"got 5xx page. url: {response.url}")
            self.crawler.stats.inc_value(f"{self.name}/5xx_cnt")
        else:
            self.logger.warning(f"unknown page. url: {response.url}, status: {response.status}, content: {response.text[:500]}")
            self.crawler.stats.inc_value(f"{self.name}/other_cnt")
        if page:
            if page == 'actor':
                item = IafdPerformersItem()
                item['href'] = response.url
                item['name'] = response.meta.get('name', '')
                item['is_full_data'] = 404
                yield item
            elif page == 'movie':
                item = IafdMoviesItem()
                item['href'] = response.url
                item['title'] = response.meta.get('title', '')
                item['is_full_data'] = 404
                yield item
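
    # Items marked is_full_data=404 remain in the DB; combined with
    # is_full_data__in=[0, 404] in update mode, this makes failed pages
    # eligible for a retry on the next update run.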

    def load_existed_actors(self):
        query_args = {}
        rows = db_tools.get_performers(**query_args)
        if rows:
            for item in rows:
                self.existed_actors[item['href']] = {'is_full_data': item['is_full_data'], 'movies_cnt': item['movies_cnt']}
        else:
            self.logger.warning(f"get_performers empty. query args: {query_args}")

    def load_existed_movies(self):
        query_args = {}
        rows = db_tools.get_movies(**query_args)
        if rows:
            for item in rows:
                self.existed_movies[item['href']] = item['is_full_data']
        else:
            self.logger.warning(f"get_movies empty. query args: {query_args}")

    # The in-memory cache could also be replaced with a direct DB query.
    def need_update_movie(self, href):
        return not (href in self.existed_movies and self.existed_movies[href] > 0)

    # The in-memory cache could also be replaced with a direct DB query.
    def need_update_actor(self, href, movies_cnt):
        if href not in self.existed_actors:
            return True
        data = self.existed_actors[href]
        if data['is_full_data'] <= 0:
            return True
        if data['movies_cnt'] < movies_cnt:
            return True
        return False
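
    # Example: an actor cached with is_full_data=1 and movies_cnt=10 is
    # re-fetched only when a listing now reports more than 10 movies, while
    # is_full_data <= 0 always triggers a re-fetch.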

    def add_actor_to_existed(self, href, movies_cnt, is_full_data=1):
        self.existed_actors[href] = {'is_full_data': is_full_data, 'movies_cnt': movies_cnt}

    def acc_movie_to_existed(self, href, is_full_data=1):
        self.existed_movies[href] = is_full_data

    def _can_request(self, href):
        if href in self.requested_url:
            return False
        if self.debug:  # In debug mode, cap how often certain URL patterns may be requested.
            keys = ['person.rme', 'title.rme']
            for key in keys:
                count = 0
                for url in self.requested_url:
                    if key in url.lower():
                        count += 1
                if count >= 2 and key in href.lower():
                    return False
        self.requested_url.add(href)
        return True
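
    # Example (debug mode): once two person.rme URLs have been requested,
    # every further person.rme URL is rejected, so a debug run touches at
    # most two performer detail pages (and likewise two title.rme pages).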