modify scripts
@@ -136,6 +136,7 @@ if [ "${PERIOD}" = "--monthly" ]; then
     register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
     register_spider "javhd" "scrapy crawl javhd -a mod='update' "
     register_spider "lord" "scrapy crawl lord -a mod='update' "
+    register_spider "javbus" "scrapy crawl javbus -a cmd='actors' -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/ "
 fi
 
 
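Note: Scrapy forwards each "-a key=value" argument to the spider's __init__ as a keyword argument, which is how cmd='actors' and mod='update' reach the spiders registered above. A minimal sketch of the receiving side (hypothetical skeleton for illustration only; the real spiders live in scrapy_proj):

    import scrapy

    class JavbusSpider(scrapy.Spider):
        # Hypothetical skeleton; not the project's actual spider.
        name = "javbus"

        def __init__(self, cmd="", mod="", *args, **kwargs):
            super().__init__(*args, **kwargs)
            # "scrapy crawl javbus -a cmd='actors'" arrives here as cmd="actors"
            self.cmd_list = cmd.split(",") if cmd else []
            self.mod = mod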
File diff suppressed because it is too large.
@@ -7,6 +7,7 @@ home_dir = os.path.expanduser("~")
 global_share_data_dir = f'{home_dir}/sharedata'
 default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"
 shared_db_path = f"{global_share_data_dir}/sqlite/shared.db"
+test_db_path = f"{global_share_data_dir}/sqlite/test.db"
 
 # Singleton metaclass
 class SingletonMeta(type):
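The SingletonMeta body is unchanged and not shown in this diff; a metaclass of this name conventionally caches one instance per class, roughly like the following sketch (an assumption, not the project's actual code):

    class SingletonMeta(type):
        # Assumed implementation: cache one instance per class.
        _instances = {}

        def __call__(cls, *args, **kwargs):
            if cls not in cls._instances:
                cls._instances[cls] = super().__call__(*args, **kwargs)
            return cls._instances[cls]

    class DBConfig(metaclass=SingletonMeta):
        pass

    assert DBConfig() is DBConfig()  # every call returns the same instance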
@@ -35,55 +35,6 @@ class Sis001Item(scrapy.Item):
     size_gb = scrapy.Field()
     update_date = scrapy.Field()
 
-class IAFDPersonItem(scrapy.Item):
-    item_type = comm.ITEM_TYPE_ACTOR_INDEX
-    name = scrapy.Field()
-    href = scrapy.Field()
-    from_astro_list = scrapy.Field()
-    from_birth_list = scrapy.Field()
-    from_ethnic_list = scrapy.Field()
-    from_movie_list = scrapy.Field()
-
-class IAFDMovieItem(scrapy.Item):
-    item_type = comm.ITEM_TYPE_MOVIE_INDEX
-    title = scrapy.Field()
-    href = scrapy.Field()
-    release_year = scrapy.Field()
-    from_performer_list = scrapy.Field()
-    from_dist_list = scrapy.Field()
-    from_stu_list = scrapy.Field()
-
-class IAFDPersonDetailItem(scrapy.Item):
-    item_type = comm.ITEM_TYPE_ACTOR_DETAIL
-    href = scrapy.Field()
-    person = scrapy.Field()
-    gender = scrapy.Field()
-    birthday = scrapy.Field()
-    astrology = scrapy.Field()
-    birthplace = scrapy.Field()
-    years_active = scrapy.Field()
-    ethnicity = scrapy.Field()
-    nationality = scrapy.Field()
-    hair_colors = scrapy.Field()
-    eye_color = scrapy.Field()
-    height = scrapy.Field()
-    weight = scrapy.Field()
-    measurements = scrapy.Field()
-    tattoos = scrapy.Field()
-    piercings = scrapy.Field()
-    movies_cnt = scrapy.Field()
-    vixen_cnt = scrapy.Field()
-    blacked_cnt = scrapy.Field()
-    tushy_cnt = scrapy.Field()
-    x_art_cnt = scrapy.Field()
-    performer_aka = scrapy.Field()
-
-class IAFDMovieDetailItem(scrapy.Item):
-    item_type = comm.ITEM_TYPE_MOVIE_DETAIL
-    title = scrapy.Field()
-    href = scrapy.Field()
-    # More movie-detail fields can be added here as needed
-
 class PBoxStuItem(scrapy.Item):
     item_type = scrapy.Field()
     label_id = scrapy.Field()

@@ -228,10 +179,12 @@ class IafdDistributorsItem(scrapy.Item):
     href = scrapy.Field()
     parent_id = scrapy.Field()
     details = scrapy.Field()
+    # Added fields below
 
 class IafdMetaEthnicItem(scrapy.Item):
     name = scrapy.Field()
     href = scrapy.Field()
+    # Added fields below
 
 class IafdMoviesItem(scrapy.Item):
     title = scrapy.Field()

@@ -251,21 +204,35 @@ class IafdMoviesItem(scrapy.Item):
     from_performer_list = scrapy.Field()
     from_dist_list = scrapy.Field()
     from_stu_list = scrapy.Field()
+    # Added fields below
+    Directors = scrapy.Field()
+    Distributor = scrapy.Field()
+    DistributorHref = scrapy.Field()
+    Studio = scrapy.Field()
+    StudioHref = scrapy.Field()
+    Director = scrapy.Field()
+    DirectorHref = scrapy.Field()
+    Performers = scrapy.Field()
+    SceneBreakdowns = scrapy.Field()
+    AppearsIn = scrapy.Field()
 
 class IafdMoviesAppersInItem(scrapy.Item):
     movie_id = scrapy.Field()
     appears_in_id = scrapy.Field()
     gradation = scrapy.Field()
     notes = scrapy.Field()
+    # Added fields below
 
 class IafdPerformerAliasesItem(scrapy.Item):
     performer_id = scrapy.Field()
     alias = scrapy.Field()
+    # Added fields below
 
 class IafdPerformerUrlsItem(scrapy.Item):
     performer_id = scrapy.Field()
     position = scrapy.Field()
     url = scrapy.Field()
+    # Added fields below
 
 class IafdPerformersItem(scrapy.Item):
     name = scrapy.Field()

@@ -299,18 +266,23 @@ class IafdPerformersItem(scrapy.Item):
     from_birth_list = scrapy.Field()
     from_ethnic_list = scrapy.Field()
     from_movie_list = scrapy.Field()
+    # Added fields below
+    credits = scrapy.Field()
+    performer_aka = scrapy.Field()
 
 class IafdPerformersMoviesItem(scrapy.Item):
     performer_id = scrapy.Field()
     movie_id = scrapy.Field()
     role = scrapy.Field()
     notes = scrapy.Field()
+    # Added fields below
 
 class IafdStudiosItem(scrapy.Item):
     name = scrapy.Field()
     href = scrapy.Field()
     parent_id = scrapy.Field()
     details = scrapy.Field()
+    # Added fields below
 
 class IafdTaskLogItem(scrapy.Item):
     task_id = scrapy.Field()

@@ -321,6 +293,7 @@ class IafdTaskLogItem(scrapy.Item):
     total_distributors = scrapy.Field()
     total_studios = scrapy.Field()
     task_status = scrapy.Field()
+    # Added fields below
 
 class JavbusActorsItem(scrapy.Item):
     ja_name = scrapy.Field()
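For context: a scrapy.Item only accepts keys that were declared as scrapy.Field(), which is what the spider code further down relies on when it copies parser output with "if k in item.fields". A small illustration using IafdMoviesItem as declared above:

    from scrapy_proj.items import IafdMoviesItem

    data = {"title": "Example", "href": "https://example.com/m/1", "bogus": 1}
    item = IafdMoviesItem()
    for k, v in data.items():
        if k in item.fields:  # skip keys that are not declared Fields
            item[k] = v
    # item["bogus"] = 1 would raise KeyError: "bogus" is not a declared Field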
@@ -11,7 +11,7 @@
 # return item
 import json
 import scrapy
-from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem, PBoxStuItem
+from scrapy_proj.items import U001Item, Sis001Item, PBoxStuItem
 from scrapy_proj.db_wapper.spider_db_handler import spider_handler_registry, U3C3DBHandler, SisDBHandler, IAFDDBHandler, PboxDBHandler
 
 class SQLitePipeline():
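The SQLitePipeline body is unchanged; only the import shrinks because the removed IAFD*Item classes no longer exist. For orientation, a Scrapy item pipeline exposes the standard hooks sketched below (hedged; the registry accessor is an assumption, not taken from this diff):

    class SQLitePipeline:
        def open_spider(self, spider):
            # Look up the per-spider DB handler (hypothetical accessor).
            self.handler = spider_handler_registry.get(spider.name)

        def process_item(self, item, spider):
            # Persist the item here, then return it so later pipelines still run.
            return item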
@@ -6,6 +6,17 @@ from twisted.internet import reactor, defer, asyncioreactor
 import time
 
 class BaseSpider(scrapy.Spider):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.requested_url = set()
+
+    # Track the request URLs already issued during this task
+    def _can_request(self, href):
+        if href in self.requested_url:
+            return False
+        self.requested_url.add(href)
+        return True
+
     def start_requests(self):
         """Unified request generation, compatible with different entry points."""
         # If an async start method is defined, use it
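The new _can_request helper gives each crawl run its own URL-level dedup set, independent of Scrapy's built-in request dupefilter. A usage sketch from a hypothetical subclass:

    import scrapy
    from scrapy_proj.spiders.base_spider import BaseSpider

    class ExampleSpider(BaseSpider):
        name = "example"  # hypothetical spider for illustration

        def parse(self, response):
            for href in response.css("a::attr(href)").getall():
                url = response.urljoin(href)
                if self._can_request(url):  # True only the first time a URL is seen
                    yield scrapy.Request(url, callback=self.parse)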
@@ -3,11 +3,11 @@ import re
 import sys
 from urllib.parse import urljoin, quote_plus
 from scrapy_proj.spiders.base_spider import BaseSpider
-from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
+from scrapy_proj.items import IafdDistributorsItem, IafdMetaEthnicItem, IafdMoviesItem, IafdPerformersItem, IafdStudiosItem
 from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
 from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
 from scrapy_proj.spiders.parser.iafd_parser import common_parser
-from scrapy_proj.utils.utils import pretty_json_simple
+from scrapy_proj.utils.utils import pretty_json_simple, is_valid_url
 
 db_tools = IAFDDBHandler()
 
@@ -40,8 +40,19 @@ class IAFDSpider(BaseSpider):
         if cmd and cmd != '':
             self.cmd_list = cmd.split(',')
 
+        self.existed_actors = {}
+        self.existed_movies = {}
+        self.load_existed_actors()
+        self.load_existed_movies()
+
     # Entry point, triggered by the base class
     def custom_start_requests(self):
+        self.crawler.stats.set_value(f"{self.name}/actor_all", 0)
+        self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
+        self.crawler.stats.set_value(f"{self.name}/actor_404", 0)
+        self.crawler.stats.set_value(f"{self.name}/movie_all", 0)
+        self.crawler.stats.set_value(f"{self.name}/movie_done", 0)
+        self.crawler.stats.set_value(f"{self.name}/movie_404", 0)
         # Dispatch on the command words
         if self.cmd_astro in self.cmd_list:
             # Key point: iterate the generator produced by start_astro and forward its Requests
@@ -117,59 +128,65 @@ class IAFDSpider(BaseSpider):
         async for request in super().start():
             yield request
 
+    # Got a list page; go fetch the details
     def parse_astro_page(self, response):
         astro = response.meta.get('astro', '')
         data, next_url = common_parser(html=response.text, page='astro', astro=astro)
         if data:
-            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+            self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
+            for item in data:
+                yield from self._create_performer_request(href=item['href'], name=item['person'])
         else:
             self.logger.warning(f"parse data error. {response.url}")
 
-        item = IAFDPersonDetailItem()
-        #yield item
-
+    # Got a list page; go fetch the details
     def parse_birth_page(self, response):
         month = response.meta['month']
         day = response.meta['day']
         data, next_url = common_parser(html=response.text, page='birth', month=month, day=day)
         if data:
-            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+            self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
+            for item in data:
+                yield from self._create_performer_request(href=item['href'], name=item['person'])
         else:
             self.logger.warning(f"parse data error. {response.url}")
 
-        item = IAFDPersonDetailItem()
-        #yield item
-
+    # Got the meta list; go fetch each ethnic page
     def parse_ethnic_list_page(self, response):
         div_root = response.css('select#ethnicity1')
         if div_root:
             options = div_root.css('option')
-            self.crawler.stats.set_value(f"{self.name}/ethnic_all", len(options))
-            self.crawler.stats.set_value(f"{self.name}/ethnic_done", 0)
             for option in options:
                 href = option.attrib.get('value')
                 text = option.css('::text').get().strip()
                 if href and href.lower() != 'none':
                     ethnic_url = urljoin(response.url, href)
                     self.logger.info(f"ethnic: ({text}), start url: {ethnic_url}")
-                    yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
+                    item = IafdMetaEthnicItem()
+                    item['name'] = text
+                    item['href'] = ethnic_url
+                    yield item
+
+                    yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text})
+        else:
+            self.logger.warning(f"parse page error. url: {response.url}")
 
+    # Got a list page; go fetch the details
     def parse_ethnic_page(self, response):
         ethnic = response.meta['ethnic']
         data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
         if data:
-            self.logger.debug(f"fetched data from {response.url}, data: {data}")
+            self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
+            for item in data:
+                yield from self._create_performer_request(href=item['href'], name=item['person'])
+
+            if next_url:
+                yield scrapy.Request(next_url, callback=self.parse_ethnic_page, meta={'ethnic': ethnic})
+            else:
+                self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
         else:
             self.logger.warning(f"parse data error. {response.url}")
 
-        if next_url:
-            self.logger.info(f"find next page: {next_url}")
-        else:
-            self.logger.info(f"found all pages. url: {response.url}")
-
-        item = IAFDPersonDetailItem()
-        #yield item
-
     def parse_distributors_list_page(self, response):
         select_element = response.css('select[name="Distrib"]')
         if select_element:
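Aside: the distributor/studio list pages handled next are plain HTML select menus, so option extraction with Scrapy selectors works exactly as in the hunks here; a standalone illustration:

    from scrapy.selector import Selector

    html = '<select id="ethnicity1"><option value="/ethnic/asian.htm">Asian</option></select>'
    sel = Selector(text=html)
    for option in sel.css("select#ethnicity1 option"):
        href = option.attrib.get("value")          # "/ethnic/asian.htm"
        text = option.css("::text").get().strip()  # "Asian"
        print(href, text)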
@@ -178,7 +195,15 @@ class IAFDSpider(BaseSpider):
                 value = option.attrib.get('value')
                 text = option.css('::text').get().strip()
                 dis_url = f"{self.host_url}/distrib.rme/distrib={value}"
+                item = IafdDistributorsItem()
+                item['name'] = text
+                item['href'] = dis_url
+
+                yield item
+
                 yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
+        else:
+            self.logger.warning(f"parse page error. url: {response.url}")
 
     def parse_studios_list_page(self, response):
         select_element = response.css('select[name="Studio"]')
@@ -188,53 +213,156 @@ class IAFDSpider(BaseSpider):
                 value = option.attrib.get('value')
                 text = option.css('::text').get().strip()
                 dis_url = f"{self.host_url}/studio.rme/studio={value}"
+                item = IafdStudiosItem()
+                item['name'] = text
+                item['href'] = dis_url
+                yield item
+
                 yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
+        else:
+            self.logger.warning(f"parse page error. url: {response.url}")
 
     def parse_stu_dist_page(self, response):
         list_type = response.meta.get('list_type', '')
         data, next_url = common_parser(html=response.text, page=list_type)
         if data:
             self.logger.debug(f"fetched data from {response.url}, data: {data}")
+            for movie in data:
+                yield from self._create_movie_request(href=movie['href'], title=movie['title'])
         else:
             self.logger.warning(f"fetched data error. {response.url}")
 
-        item = IAFDPersonDetailItem()
-        #yield item
+    # Unified helper for issuing performer-detail requests
+    def _create_performer_request(self, href, name):
+        if href != '' and is_valid_url(href):
+            if self._can_request(href):
+                self.crawler.stats.inc_value(f"{self.name}/actor_all")
+                yield scrapy.Request(href,
+                                     callback=self.parse_person_detail_page,
+                                     meta={'name': name, 'item_type': 'actor'}
+                                     )
+        else:
+            self.logger.warning(f"wrong url. {href}, ignore...")
+
+    # Unified helper for issuing movie-detail requests
+    def _create_movie_request(self, href, title):
+        if href != '' and is_valid_url(href):
+            if self.need_update_movie(href) and self._can_request(href):
+                self.crawler.stats.inc_value(f"{self.name}/movie_all")
+                yield scrapy.Request(href,
+                                     callback=self.parse_movie_detail_page,
+                                     meta={'title': title, 'item_type': 'movie'}
+                                     )
+        else:
+            self.logger.warning(f"wrong url. {href}, ignore...")
+
+    # Parse and handle the performer detail page
     def parse_person_detail_page(self, response):
         data = common_parser(html=response.text, page='actor', url=response.url)
         if data:
             self.logger.debug(f"fetched data from {response.url}, data: {data}")
+            self.crawler.stats.inc_value(f"{self.name}/actor_done")
+            item = IafdPerformersItem()
+            for k, v in data.items():
+                if k in item.fields:
+                    item[k] = v
+
+            yield item
+
+            # Walk the movie credit lists
+            for role, movies in data.get('credits', {}).items():
+                if movies:
+                    for movie in movies:
+                        yield from self._create_movie_request(href=movie['href'], title=movie['title'])
         else:
             self.logger.warning(f"fetched data error. {response.url}")
 
-        item = IAFDPersonDetailItem()
-        #yield item
-
+    # Parse and handle the movie detail page
     def parse_movie_detail_page(self, response):
         title = response.meta.get('title', '')
         data = common_parser(html=response.text, page='movies', href=response.url, title=title)
         if data:
             self.logger.debug(f"fetched data from {response.url}, data: {data}")
+            self.crawler.stats.inc_value(f"{self.name}/movie_done")
+            item = IafdMoviesItem()
+            for k, v in data.items():
+                if k in item.fields:
+                    item[k] = v
+            yield item
+
+            # Follow the related links
+            link_url = data.get('DistributorHref', '')
+            if is_valid_url(link_url) and self._can_request(link_url):
+                yield scrapy.Request(link_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
+
+            link_url = data.get('StudioHref', '')
+            if is_valid_url(link_url) and self._can_request(link_url):
+                yield scrapy.Request(link_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
+
+            link_url = data.get('DirectorHref', '')
+            yield from self._create_performer_request(href=link_url, name=data.get('Director'))
+
+            for director in data.get('Directors', []):
+                yield from self._create_performer_request(href=director['href'], name=director['name'])
+
         else:
             self.logger.warning(f"fetched data error. {response.url}")
 
-        item = IAFDMovieDetailItem()
-        #yield item
-
-    def custom_block_check(self, response):
-        item_type = response.meta.get('item_type', '')
-        if "invalid or outdated page" in response.text.lower():
-            self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {item_type}")
-            return "invalid or outdated page"
-        else:
-            self.logger.info(f"right content. url: {response.url}")
-        return None
-
-    # Handle page errors, mainly 404 and 403
-    def handle_blocked(self, response, reason):
-        item_type = response.meta.get('item_type', '')
-        if response.status in [404, 403]:
-            self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}")
+    # Unified detection and handling of abnormal responses
+    def _handle_invalid_response(self, response):
+        if response.status in [200]:
+            if "invalid or outdated page" in response.text.lower():
+                self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
+                # TODO: update the actor or movie that returned 404
+            else:
+                self.logger.warning(f"unknown page. url:{response.url}, content: {response.text[:500]}")
+
+        elif response.status in [404, 403]:
+            self.logger.warning(f"get 404 page. url: {response.url}")
+            # TODO: update the actor or movie that returned 404
+
+        else:
+            self.logger.warning(f"unknown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")
+
+    def load_existed_actors(self):
+        query_args = {}
+        rows = db_tools.query_performer_hrefs(**query_args)
+        if rows:
+            for item in rows:
+                self.existed_actors[item['href']] = {'is_full_data': item['is_full_data'], 'movies_cnt': item['movies_cnt']}
+        else:
+            self.logger.warning(f"query_performer_hrefs empty. query args: {query_args}")
+
+    def load_existed_movies(self):
+        query_args = {}
+        rows = db_tools.query_movie_hrefs(**query_args)
+        if rows:
+            for item in rows:
+                self.existed_movies[item['href']] = item['is_full_data']
+        else:
+            self.logger.warning(f"query_movies empty. query args: {query_args}")
+
+    # In-memory cache; could also be replaced by a DB lookup
+    def need_update_movie(self, href):
+        return not (href in self.existed_movies and self.existed_movies[href] > 0)
+
+    # In-memory cache; could also be replaced by a DB lookup
+    def need_update_actor(self, href, movies_cnt):
+        if href not in self.existed_actors:
+            return True
+        data = self.existed_actors[href]
+        if data['is_full_data'] <= 0:
+            return True
+        if data['movies_cnt'] < movies_cnt:
+            return True
+
+        return False
+
+    def add_actor_to_existed(self, href, movies_cnt, is_full_data=1):
+        self.existed_actors[href] = {'is_full_data': is_full_data, 'movies_cnt': movies_cnt}
+
+    def acc_movie_to_existed(self, href, is_full_data=1):
+        self.existed_movies[href] = is_full_data
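Everything from _create_performer_request down to acc_movie_to_existed is new; together with the counters initialized in custom_start_requests it makes a crawl run summarizable at shutdown. One way to read the counters back (a sketch using Scrapy's standard stats API and the closed() spider hook, not code from this commit):

    # Inside IAFDSpider; closed() is Scrapy's standard shutdown hook.
    def closed(self, reason):
        stats = self.crawler.stats
        self.logger.info(
            "actors %s/%s, movies %s/%s",
            stats.get_value(f"{self.name}/actor_done", 0),
            stats.get_value(f"{self.name}/actor_all", 0),
            stats.get_value(f"{self.name}/movie_done", 0),
            stats.get_value(f"{self.name}/movie_all", 0),
        )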
@@ -1,4 +1,3 @@
-
 import cloudscraper
 import time
 import json
@@ -11,6 +10,7 @@ import re
 from bs4 import BeautifulSoup
 from requests.exceptions import RequestException
 from functools import partial
+from datetime import datetime
 #import config
 #import utils
 
@@ -410,8 +410,8 @@ def parse_page_performer(soup, url):
     'nationality': 'Nationality',
     'hair_colors': 'Hair Colors',
     'eye_color': 'Eye Color',
-    'height': 'Height',
-    'weight': 'Weight',
+    'height_str': 'Height',
+    'weight_str': 'Weight',
     'measurements': 'Measurements',
     'tattoos': 'Tattoos',
     'piercings': 'Piercings'
@@ -474,6 +474,20 @@ def parse_page_performer(soup, url):
     return data
 
 
+def extract_year_from_date_string(date_str):
+    """
+    Extract the year from a date string like "Apr 23, 2021".
+
+    :param date_str: the date string to parse (e.g. "Apr 23, 2021")
+    :return: the extracted year (int); returns 0 if parsing fails
+    """
+    try:
+        date_obj = datetime.strptime(date_str, "%b %d, %Y")
+        return date_obj.year
+    except ValueError:
+        return 0
+    except TypeError:
+        return 0
 
 # Parse the page HTML and extract the movie info
 def parse_page_movie(soup, href, title):
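IAFD release dates follow the "%b %d, %Y" pattern, and the helper added above deliberately returns 0 instead of raising when the input is malformed or None. A quick check (assuming the helper lives in scrapy_proj.spiders.parser.iafd_parser, as the spider's imports suggest):

    from scrapy_proj.spiders.parser.iafd_parser import extract_year_from_date_string

    print(extract_year_from_date_string("Apr 23, 2021"))  # 2021
    print(extract_year_from_date_string("not a date"))    # 0
    print(extract_year_from_date_string(None))            # 0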
@@ -595,19 +609,20 @@ def parse_page_movie(soup, href, title):
     return {
         "href": href,
         "title": title,
-        "Minutes": movie_data.get("Minutes", ""),
-        "Distributor": movie_data.get("Distributor", ""),
-        "Studio": movie_data.get("Studio", ""),
-        "ReleaseDate": movie_data.get("Release Date", ""),
-        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
-        "All-Girl": movie_data.get("All-Girl", ""),
-        "All-Male": movie_data.get("All-Male", ""),
-        "Compilation": movie_data.get("Compilation", ""),
-        "Webscene": movie_data.get("Webscene", ""),
+        "minutes": movie_data.get("Minutes", ""),
+        "release_date": movie_data.get("Release Date", ""),
+        "added_to_IAFD_date": movie_data.get("Date Added to IAFD", ""),
+        "all_girl": movie_data.get("All-Girl", ""),
+        "all_male": movie_data.get("All-Male", ""),
+        "compilation": movie_data.get("Compilation", ""),
+        "webscene": movie_data.get("Webscene", ""),
+        "release_year": extract_year_from_date_string(movie_data.get("Release Date", "")),
         "Director": movie_data.get("Director", ""),
         "DirectorHref": movie_data.get("DirectorHref", ""),
-        "DistributorHref": movie_data.get("DistributorHref", ""),
+        "Studio": movie_data.get("Studio", ""),
         "StudioHref": movie_data.get("StudioHref", ""),
+        "Distributor": movie_data.get("Distributor", ""),
+        "DistributorHref": movie_data.get("DistributorHref", ""),
         "Directors": movie_data.get("Directors", []),  # may be absent
         "Performers": performers,
         "SceneBreakdowns": scene_breakdowns,