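"""Scrapy spider for iafd.com.

Crawl entry points are selected through the `cmd` spider argument
(astro / birth / ethnic / dist / stu / performers / movies). Listing pages
feed performer and movie detail pages, and parsed records are emitted as
Iafd* items.
"""
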
import scrapy
import sys

from urllib.parse import urljoin

from scrapy_proj.spiders.base_spider import BaseSpider
from scrapy_proj.items import IafdDistributorsItem, IafdMetaEthnicItem, IafdMoviesItem, IafdPerformersItem, IafdStudiosItem
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
from scrapy_proj.spiders.parser.iafd_parser import common_parser
from scrapy_proj.utils.utils import is_valid_url

# Shared DB handler used to read and write the performer/movie work queues.
db_tools = IAFDDBHandler()


class IAFDSpider(BaseSpider):
    name = SPIDER_NAME_IAFD
    allowed_domains = ["iafd.com", "www.iafd.com"]

    host_url = "https://www.iafd.com"
    astro_base_url = f"{host_url}/astrology.rme/sign="
    astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
    birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
    distributors_list_url = f"{host_url}/distrib.asp"
    studios_list_url = f"{host_url}/studio.asp"
    ethnic_list_url = f"{host_url}/advsearch.asp"

    def __init__(self, debug='false', cmd='', mod='all', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.update_mode = bool(mod and mod.lower() == 'update')
        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")

        # Command keywords; each one enables a crawl entry point.
        self.cmd_astro = 'astro'
        self.cmd_birth = 'birth'
        self.cmd_ethnic = 'ethnic'
        self.cmd_dist = 'dist'
        self.cmd_stu = 'stu'
        self.cmd_performers = 'performers'
        self.cmd_movies = 'movies'
        self.cmd_list = [self.cmd_astro, self.cmd_birth, self.cmd_ethnic, self.cmd_dist, self.cmd_stu, self.cmd_performers, self.cmd_movies]
        if cmd:
            self.cmd_list = cmd.split(',')

        # In-memory caches of rows already persisted, used to skip records
        # that are already complete.
        self.existed_actors = {}
        self.existed_movies = {}
        self.load_existed_actors()
        self.load_existed_movies()
        # URLs already scheduled in this run, for in-process deduplication.
        self.requested_url = set()

    # Entry point, triggered by the base class.
    def custom_start_requests(self):
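        """Reset the per-run stats counters, then yield the entry Requests
        for every command keyword enabled in self.cmd_list."""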
        self.crawler.stats.set_value(f"{self.name}/actor_all", 0)
        self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
        self.crawler.stats.set_value(f"{self.name}/movie_all", 0)
        self.crawler.stats.set_value(f"{self.name}/movie_done", 0)
        self.crawler.stats.set_value(f"{self.name}/ethnic_pages", 0)
        self.crawler.stats.set_value(f"{self.name}/dist_pages", 0)
        self.crawler.stats.set_value(f"{self.name}/stu_pages", 0)
        self.crawler.stats.set_value(f"{self.name}/4xx_cnt", 0)
        self.crawler.stats.set_value(f"{self.name}/5xx_cnt", 0)
        self.crawler.stats.set_value(f"{self.name}/other_cnt", 0)

        # Fetch performer lists by zodiac sign; forward the Requests produced
        # by the start_astro generator to the framework.
        if self.cmd_astro in self.cmd_list:
            yield from self.start_astro()

        # Fetch performer lists by birthday.
        if self.cmd_birth in self.cmd_list:
            yield from self.start_birth()

        # Fetch the ethnicity list.
        if self.cmd_ethnic in self.cmd_list:
            yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)

        # Fetch the distributors list.
        if self.cmd_dist in self.cmd_list:
            yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)

        # Fetch the studios list.
        if self.cmd_stu in self.cmd_list:
            yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)

        query_args = {}
        if self.debug:
            query_args['limit'] = 5
        if self.update_mode:
            query_args['is_full_data__in'] = [0, 404]

        # Load performers pending update from the DB.
        if self.cmd_performers in self.cmd_list:
            actors = db_tools.get_performers(**query_args)
            self.crawler.stats.set_value(f"{self.name}/actor_all", len(actors) if actors else 0)
            self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
            if actors:
                for item in actors:
                    href = item.get('href', '')
                    movies_cnt = item.get('movies_cnt') or 0
                    self.logger.info(f"fetch from db. item: {item}")
                    yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt, 'item_type': 'actor'})

        # Load movies pending update from the DB.
        if self.cmd_movies in self.cmd_list:
            movies = db_tools.get_movies(**query_args)
            self.crawler.stats.set_value(f"{self.name}/movie_all", len(movies) if movies else 0)
            self.crawler.stats.set_value(f"{self.name}/movie_done", 0)
            if movies:
                for item in movies:
                    href = item.get('href', '')
                    self.logger.info(f"fetch from db. item: {item}")
                    yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', ''), 'item_type': 'movie'})

    def start_astro(self):
        # Fetch performer lists by zodiac sign.
        for astro in self.astro_list:
            url = self.astro_base_url + astro
            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})

    def start_birth(self):
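        """Iterate every month/day combination from 1/1 through 12/31; day
        numbers that do not exist for a given month (e.g. Feb 30) are assumed
        to be handled gracefully by the site's calendar page."""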
        for month in range(1, 13):
            for day in range(1, 32):
                url = self.birth_base_url.format(month=month, day=day)
                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})

    async def start(self):
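        """Async entry point (Scrapy >= 2.13). Delegates to the base class,
        which is expected to route into custom_start_requests()."""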
        # Delegate to the original start_requests flow.
        async for request in super().start():
            yield request

    # Parse the listing page, then request each performer's detail page.
    def parse_astro_page(self, response):
        astro = response.meta.get('astro', '')
        data, next_url = common_parser(html=response.text, page='astro', astro=astro)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
            for item in data:
                yield from self._create_performer_request(href=item['href'], name=item['person'])
        else:
            yield from self._handle_invalid_response(response, page='astro')

    # Parse the listing page, then request each performer's detail page.
    def parse_birth_page(self, response):
        month = response.meta['month']
        day = response.meta['day']
        data, next_url = common_parser(html=response.text, page='birth', month=month, day=day)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
            for item in data:
                yield from self._create_performer_request(href=item['href'], name=item['person'])
        else:
            yield from self._handle_invalid_response(response, page='birth')

    # Parse the ethnicity dropdown, emit one meta item per ethnicity, then
    # request each ethnicity's listing pages.
    def parse_ethnic_list_page(self, response):
        div_root = response.css('select#ethnicity1')
        if div_root:
            options = div_root.css('option')
            for option in options:
                href = option.attrib.get('value')
                text = (option.css('::text').get() or '').strip()
                if href and href.lower() != 'none':
                    ethnic_url = urljoin(response.url, href)
                    self.logger.info(f"ethnic: ({text}), start url: {ethnic_url}")
                    item = IafdMetaEthnicItem()
                    item['name'] = text
                    item['href'] = ethnic_url
                    yield item

                    yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text, 'depth': 1})
        else:
            yield from self._handle_invalid_response(response, page='ethnic_list')

    # Parse one listing page, request performer details, and follow pagination.
    def parse_ethnic_page(self, response):
        ethnic = response.meta['ethnic']
        depth = response.meta.get('depth', 1)
        if self.debug and depth >= 3:
            self.logger.debug(f"debug mode, stop next page. ethnic: {ethnic}, url: {response.url}")
            return

        data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
            self.crawler.stats.inc_value(f"{self.name}/ethnic_pages")
            for item in data:
                yield from self._create_performer_request(href=item['href'], name=item['person'])

            if next_url:
                yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic, 'depth': depth + 1})
            else:
                self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
        else:
            yield from self._handle_invalid_response(response, page='ethnic')

    def parse_distributors_list_page(self, response):
        select_element = response.css('select[name="Distrib"]')
        if select_element:
            options = select_element.css('option')
            for option in options:
                value = option.attrib.get('value')
                text = (option.css('::text').get() or '').strip()
                dis_url = f"{self.host_url}/distrib.rme/distrib={value}"
                item = IafdDistributorsItem()
                item['name'] = text
                item['href'] = dis_url
                yield item

                yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
        else:
            yield from self._handle_invalid_response(response, page='dist_list')

    def parse_studios_list_page(self, response):
        select_element = response.css('select[name="Studio"]')
        if select_element:
            options = select_element.css('option')
            for option in options:
                value = option.attrib.get('value')
                text = (option.css('::text').get() or '').strip()
                stu_url = f"{self.host_url}/studio.rme/studio={value}"
                item = IafdStudiosItem()
                item['name'] = text
                item['href'] = stu_url
                yield item

                yield scrapy.Request(stu_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
        else:
            yield from self._handle_invalid_response(response, page='stu_list')

    def parse_stu_dist_page(self, response):
        list_type = response.meta.get('list_type', '')
        data, next_url = common_parser(html=response.text, page=list_type)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
            self.crawler.stats.inc_value(f"{self.name}/{list_type}_pages")
            for movie in data:
                yield from self._create_movie_request(href=movie['href'], title=movie['title'])
        else:
            yield from self._handle_invalid_response(response, page='dist_stu')

    # Uniform entry point for scheduling performer detail requests.
    def _create_performer_request(self, href, name):
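        """Generator: validates href, skips URLs already requested in this
        run, bumps the actor_all counter, and yields the detail Request.
        Call sites must use `yield from`."""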
        if not href:
            return
        if is_valid_url(href):
            if self._can_request(href):
                self.crawler.stats.inc_value(f"{self.name}/actor_all")
                yield scrapy.Request(href,
                                     callback=self.parse_person_detail_page,
                                     meta={'name': name, 'item_type': 'actor'})
        else:
            self.logger.warning(f"wrong url. {href}, ignore...")

    # Uniform entry point for scheduling movie detail requests.
    def _create_movie_request(self, href, title):
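        """Generator: like _create_performer_request, but also consults the
        in-memory cache via need_update_movie() so movies already marked
        complete are not re-fetched."""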
        if not href:
            return
        if is_valid_url(href):
            if self.need_update_movie(href) and self._can_request(href):
                self.crawler.stats.inc_value(f"{self.name}/movie_all")
                yield scrapy.Request(href,
                                     callback=self.parse_movie_detail_page,
                                     meta={'title': title, 'item_type': 'movie', 'cache': True})
        else:
            self.logger.warning(f"wrong url. {href}, ignore...")

    # Parse and process a performer detail page.
    def parse_person_detail_page(self, response):
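        """common_parser(page='actor') is expected to return a dict of
        performer fields plus a 'credits' mapping of role -> movie list.
        Known fields are copied into an IafdPerformersItem, then every
        credited movie is scheduled."""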
        data = common_parser(html=response.text, page='actor', url=response.url)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data: {data}")
            self.crawler.stats.inc_value(f"{self.name}/actor_done")
            item = IafdPerformersItem()
            item['name'] = response.meta.get('name', '')
            item['href'] = response.url
            item['is_full_data'] = 1
            for k, v in data.items():
                if k in item.fields:
                    item[k] = v

            yield item

            # Process the performer's filmography.
            for role, movies in data.get('credits', {}).items():
                if movies:
                    for movie in movies:
                        yield from self._create_movie_request(href=movie['href'], title=movie['title'])
        else:
            yield from self._handle_invalid_response(response, page='actor')

    # Parse and process a movie detail page.
    def parse_movie_detail_page(self, response):
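        """Emit an IafdMoviesItem built from the parsed fields, then follow
        the distributor, studio and director links found in the data."""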
        title = response.meta.get('title', '')
        data = common_parser(html=response.text, page='movies', href=response.url, title=title)
        if data:
            self.logger.debug(f"fetched data from {response.url}, data: {data}")
            self.crawler.stats.inc_value(f"{self.name}/movie_done")
            item = IafdMoviesItem()
            item['is_full_data'] = 1
            for k, v in data.items():
                if k in item.fields:
                    item[k] = v
            yield item

            # Follow the related links: distributor, studio and directors.
            link_url = data.get('DistributorHref', '')
            if is_valid_url(link_url) and self._can_request(link_url):
                yield scrapy.Request(link_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})

            link_url = data.get('StudioHref', '')
            if is_valid_url(link_url) and self._can_request(link_url):
                yield scrapy.Request(link_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})

            link_url = data.get('DirectorHref', '')
            yield from self._create_performer_request(href=link_url, name=data.get('Director'))

            for director in data.get('Directors', []):
                yield from self._create_performer_request(href=director['href'], name=director['name'])

        else:
            yield from self._handle_invalid_response(response, page='movie')

    # Uniformly classify and handle abnormal responses.
    def _handle_invalid_response(self, response, page=None):
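        """Bump the matching 4xx/5xx/other counter. For actor and movie pages,
        also yield a placeholder item with is_full_data=404 so that update
        mode (which queries is_full_data in [0, 404]) retries the record
        later. Generator: call sites must use `yield from`."""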
        if response.status == 200:
            if "invalid or outdated page" in response.text.lower():
                self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
                self.crawler.stats.inc_value(f"{self.name}/4xx_cnt")
            else:
                self.logger.warning(f"unknown page. url: {response.url}, content: {response.text[:500]}")
                self.crawler.stats.inc_value(f"{self.name}/other_cnt")

        elif response.status in [403, 404]:
            self.logger.warning(f"got {response.status} page. url: {response.url}")
            self.crawler.stats.inc_value(f"{self.name}/4xx_cnt")

        elif response.status in [500, 502, 503, 504, 521, 522, 524]:
            self.logger.error(f"got 5xx page. url: {response.url}")
            self.crawler.stats.inc_value(f"{self.name}/5xx_cnt")
        else:
            self.logger.warning(f"unknown page. url: {response.url}, status: {response.status}, content: {response.text[:500]}")
            self.crawler.stats.inc_value(f"{self.name}/other_cnt")

        if page:
            if page == 'actor':
                item = IafdPerformersItem()
                item['href'] = response.url
                item['name'] = response.meta.get('name', '')
                item['is_full_data'] = 404
                yield item
            elif page == 'movie':
                item = IafdMoviesItem()
                item['href'] = response.url
                item['title'] = response.meta.get('title', '')
                item['is_full_data'] = 404
                yield item

    def load_existed_actors(self):
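        """Warm the in-memory actor cache (href -> is_full_data / movies_cnt)
        from rows already stored in the DB."""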
        query_args = {}
        rows = db_tools.get_performers(**query_args)
        if rows:
            for item in rows:
                self.existed_actors[item['href']] = {'is_full_data': item['is_full_data'], 'movies_cnt': item['movies_cnt']}
        else:
            self.logger.warning(f"get_performers empty. query args: {query_args}")

    def load_existed_movies(self):
        query_args = {}
        rows = db_tools.get_movies(**query_args)
        if rows:
            for item in rows:
                self.existed_movies[item['href']] = item['is_full_data']
        else:
            self.logger.warning(f"get_movies empty. query args: {query_args}")

    # In-memory cache check; could be replaced with a DB lookup.
    def need_update_movie(self, href):
        return not (href in self.existed_movies and self.existed_movies[href] > 0)

    # In-memory cache check; could be replaced with a DB lookup.
    def need_update_actor(self, href, movies_cnt):
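        """An actor needs re-crawling if it is unknown, its record is not yet
        complete, or the site now credits more movies than the cached count."""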
        if href not in self.existed_actors:
            return True
        data = self.existed_actors[href]
        if data['is_full_data'] <= 0:
            return True
        if data['movies_cnt'] < movies_cnt:
            return True

        return False

    def add_actor_to_existed(self, href, movies_cnt, is_full_data=1):
        self.existed_actors[href] = {'is_full_data': is_full_data, 'movies_cnt': movies_cnt}

    def add_movie_to_existed(self, href, is_full_data=1):
        self.existed_movies[href] = is_full_data

    def _can_request(self, href):
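        """Return True and record the URL if it has not been requested during
        this run. In debug mode, additionally cap 'person.rme' and 'title.rme'
        URLs at two requests per pattern."""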
        if href in self.requested_url:
            return False

        if self.debug:
            # In debug mode, limit how often matching URL patterns are requested.
            keys = ['person.rme', 'title.rme']
            for key in keys:
                count = 0
                for url in self.requested_url:
                    if key.lower() in url.lower():
                        count += 1
                if count >= 2 and key in href.lower():
                    return False

        self.requested_url.add(href)
        return True