259 lines
12 KiB
Python
259 lines
12 KiB
Python
import scrapy
|
|
import re
|
|
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
|
|
from scrapy_proj.db_wapper.iafd_query import IAFDQuery
|
|
|
|
# Module-level query helper, created once at import time; the spider's
# start_requests() reads pending performers and movies through it.
db_tools = IAFDQuery()
|
|
|
|
class IAFDSpider(scrapy.Spider):
    """Spider that collects performer and movie listings from iafd.com.

    Which listings are crawled is selected by the ``cmd`` spider argument, a
    comma-separated list of command words (``astro``, ``birth``, ``ethnic``,
    ``dist``, ``stu``, ``performers``, ``movies``). An empty ``cmd`` runs
    every command.
    """

    name = "iafd"
    allowed_domains = ["iafd.com"]

    host_url = "https://www.iafd.com"
    astr_base_url = f"{host_url}/astrology.rme/sign="
    astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
    birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
    distributors_list_url = f'{host_url}/distrib.asp'
    studios_list_url = f"{host_url}/studio.asp"
    ethnic_list_url = f'{host_url}/advsearch.asp'

    # Number of calendar days per month (February includes the 29th) so that
    # start_birth() never requests impossible dates such as Feb 30 or Apr 31.
    _DAYS_IN_MONTH = (31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)

    # Provenance flags carried by the list items; exactly one is set to 1.
    _PERSON_SOURCE_FLAGS = ('from_astro_list', 'from_birth_list', 'from_ethnic_list', 'from_movie_list')
    _MOVIE_SOURCE_FLAGS = ('from_performer_list', 'from_dist_list', 'from_stu_list')

    # Detail fields read from ``span.<field>::text`` on the performer page.
    _PERSON_DETAIL_FIELDS = (
        'gender', 'birthday', 'astrology', 'birthplace', 'years_active',
        'ethnicity', 'nationality', 'hair_colors', 'eye_color', 'height',
        'weight', 'measurements', 'tattoos', 'piercings', 'movies_cnt',
        'vixen_cnt', 'blacked_cnt', 'tushy_cnt', 'x_art_cnt',
    )

    def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
        """Store the spider arguments and resolve the command list.

        Args:
            debug: ``'true'`` or ``'1'`` (case-insensitive) enables debug
                mode, which limits how many requests each command issues.
            cmd: Comma-separated command words; empty means "all commands".
            update: Integer-like value; ``0`` restricts the DB queries to
                rows with ``is_full_data = 0``.
        """
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.cmd_str = cmd
        self.update = int(update)
        self.logger.info(f"debug mod: {self.debug}, cmd: {self.cmd_str}, update: {self.update}")

        self.cmd_astro = 'astro'
        self.cmd_birth = 'birth'
        self.cmd_ethnic = 'ethnic'
        self.cmd_dist = 'dist'
        self.cmd_stu = 'stu'
        self.cmd_performers = 'performers'
        self.cmd_movies = 'movies'

        # ''.split(',') returns [''], so empty tokens must be dropped before
        # deciding whether the caller supplied any command at all.
        self.cmd_list = [word.strip() for word in str(self.cmd_str or '').split(',') if word.strip()]
        if not self.cmd_list:
            self.cmd_list = [self.cmd_astro, self.cmd_birth, self.cmd_ethnic, self.cmd_dist, self.cmd_stu, self.cmd_performers, self.cmd_movies]

    def start_requests(self):
        """Yield the initial requests for every selected command."""
        # Performer lists by astrological sign. start_astro() is a generator,
        # so it must be delegated to; a bare call would schedule nothing.
        if self.cmd_astro in self.cmd_list:
            yield from self.start_astro()

        # Performer lists by birthday.
        if self.cmd_birth in self.cmd_list:
            yield from self.start_birth()

        # Ethnicity list.
        if self.cmd_ethnic in self.cmd_list:
            yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)

        # Distributors list.
        if self.cmd_dist in self.cmd_list:
            yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)

        # Studios list.
        if self.cmd_stu in self.cmd_list:
            yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)

        query_args = {}
        if self.debug:
            query_args['limit'] = 5
        if self.update == 0:
            query_args['is_full_data'] = 0

        # Performers pending an update, read from the database.
        if self.cmd_performers in self.cmd_list:
            for row in db_tools.get_performers(**query_args) or []:
                href = row.get('href', '')
                if not href:
                    # scrapy.Request('') raises ValueError and would abort
                    # the whole start generator.
                    self.logger.warning(f"skip performer without href: {row}")
                    continue
                movies_cnt = row.get('movies_cnt') or 0
                self.logger.info(f"fetch from db. item: {row}")
                yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': row.get('id', 0), 'name': row.get('name', ''), 'movies_cnt': movies_cnt})

        # Movies pending an update, read from the database.
        if self.cmd_movies in self.cmd_list:
            for row in db_tools.get_movies(**query_args) or []:
                href = row.get('href', '')
                if not href:
                    self.logger.warning(f"skip movie without href: {row}")
                    continue
                self.logger.info(f"fetch from db. item: {row}")
                yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': row.get('id', 0), 'title': row.get('title', '')})

    def start_astro(self):
        """Yield one listing request per astrological sign (one in debug mode)."""
        for astro in self.astro_list:
            url = self.astr_base_url + astro
            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
            if self.debug:
                break

    def start_birth(self):
        """Yield one listing request per calendar day.

        In debug mode only the first day of each month is requested.
        """
        for month, last_day in enumerate(self._DAYS_IN_MONTH, start=1):
            for day in range(1, last_day + 1):
                url = self.birth_base_url.format(month=month, day=day)
                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
                if self.debug:
                    break

    async def start(self):
        """Async entry point; forwards whatever the base ``start()`` yields.

        NOTE(review): this relies on the base class providing ``start()`` and
        on that default iterating ``start_requests()`` - confirm against the
        Scrapy version in use.
        """
        async for request in super().start():
            yield request

    def _absolute_url(self, href):
        """Return *href* as an absolute URL on ``host_url``."""
        if href.startswith(('http://', 'https://')):
            return href
        return self.host_url + href

    @staticmethod
    def _first_text(selector):
        """Return the first text node under *selector*, stripped, or ``''``."""
        return (selector.css('::text').get() or '').strip()

    def _build_person_item(self, name, href, source_flag):
        """Create an IAFDPersonItem whose only set provenance flag is *source_flag*."""
        item = IAFDPersonItem()
        item['name'] = name
        item['href'] = href
        for flag in self._PERSON_SOURCE_FLAGS:
            item[flag] = 1 if flag == source_flag else 0
        return item

    def parse_astro_page(self, response):
        """Yield a person item for every performer icon on a sign listing page."""
        # Select the performer icons directly. Walking every descendant and
        # testing it with .css() also matches wrapper elements, which
        # re-yields the first performer of each wrapper as a duplicate.
        for icon in response.css('div#astro div.perficon'):
            link = icon.css('a')
            if not link:
                continue
            href = link.attrib.get('href')
            name = link.css('span.perfname::text').get()
            if name and href:
                yield self._build_person_item(name, self._absolute_url(href), 'from_astro_list')

    def parse_birth_page(self, response):
        """Yield a person item for every performer link on a birthday page."""
        containers = response.css('div.col-sm-12.col-lg-9')
        if not containers:
            return
        for cell in containers[0].css('div.col-sm-4'):
            link = cell.css('a')
            href = link.attrib.get('href') if link else None
            if not href:
                # A cell without a link carries no performer to record.
                continue
            yield self._build_person_item(self._first_text(link), self._absolute_url(href), 'from_birth_list')

    def parse_ethnic_list_page(self, response):
        """Request the listing page of each ethnicity option (one in debug mode)."""
        for option in response.css('select#ethnicity1 option'):
            href = option.attrib.get('value')
            if not href or href.lower() == 'none':
                continue
            ethnic_url = self._absolute_url(href)
            yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': self._first_text(option)})
            if self.debug:
                break

    def parse_ethnic_page(self, response):
        """Yield person items from an ethnicity listing and follow pagination."""
        ethnic = response.meta['ethnic']
        for row in response.css('div.row.headshotrow'):
            for col in row.css('div.col-lg-2.col-md-3.col-sm-4.col-xs-6'):
                link = col.css('a')
                caption = col.css('div.pictag')
                if not (link and caption):
                    continue
                href = link.attrib.get('href')
                if not href:
                    continue
                yield self._build_person_item(self._first_text(caption), self._absolute_url(href), 'from_ethnic_list')

        next_page = response.css('a[rel="next"]')
        next_href = next_page.attrib.get('href') if next_page else None
        if next_href:
            yield scrapy.Request(self._absolute_url(next_href), callback=self.parse_ethnic_page, meta={'ethnic': ethnic})

    def _parse_company_options(self, response, select_name, url_path, source_flag):
        """Yield an IAFDMovieItem and a follow-up request per ``<option>``.

        Shared by the distributor and studio list parsers, which differ only
        in the ``<select>`` name, the URL path and the provenance flag.
        """
        for option in response.css(f'select[name="{select_name}"]').css('option'):
            value = option.attrib.get('value')
            if not value:
                # Options without a value would produce URLs ending in
                # "=None" or "=".
                continue
            url = self.host_url + f"{url_path}{value}"
            item = IAFDMovieItem()
            item['title'] = self._first_text(option)
            item['href'] = url
            item['release_year'] = 0
            for flag in self._MOVIE_SOURCE_FLAGS:
                item[flag] = 1 if flag == source_flag else 0
            yield item
            yield scrapy.Request(url, callback=self.parse_movie_detail_page)

    def parse_distributors_list_page(self, response):
        """Yield an item and a follow-up request for every distributor option."""
        yield from self._parse_company_options(response, 'Distrib', '/distrib.rme/distrib=', 'from_dist_list')

    def parse_studios_list_page(self, response):
        """Yield an item and a follow-up request for every studio option."""
        yield from self._parse_company_options(response, 'Studio', '/studio.rme/studio=', 'from_stu_list')

    def parse_person_detail_page(self, response):
        """Yield an IAFDPersonDetailItem built from a performer page.

        The selectors are placeholders: the original comments say to adjust
        them to the real page structure.
        """
        item = IAFDPersonDetailItem()
        item['href'] = response.url
        item['person'] = response.css('h1::text').get()  # assumes the name is in the h1 tag
        for field in self._PERSON_DETAIL_FIELDS:
            item[field] = response.css(f'span.{field}::text').get()
        item['performer_aka'] = response.css('span.performer_aka::text').getall()
        yield item

    def parse_movie_detail_page(self, response):
        """Yield an IAFDMovieDetailItem with the page title and URL.

        Further fields are still to be parsed once the page structure is known.
        """
        item = IAFDMovieDetailItem()
        item['title'] = response.css('h1::text').get()  # assumes the title is in the h1 tag
        item['href'] = response.url
        yield item