modify scripts
@@ -978,8 +978,8 @@ class JavBusDBHandler(SQLiteDBHandler):
 @register_handler(comm.SPIDER_NAME_IAFD)
 class IAFDDBHandler(SQLiteDBHandler):
-    #def __init__(self, db_path=shared_db_path):
-    def __init__(self, db_path=test_db_path):
+    def __init__(self, db_path=shared_db_path):
+    #def __init__(self, db_path=test_db_path):
         super().__init__(db_path)
         self.tbl_name_performers = 'iafd_performers'
         self.tbl_name_movies = 'iafd_movies'
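
Note: this hunk flips IAFDDBHandler from the test database back to the shared one, keeping the old default commented out for quick switching during development. The register_handler decorator is not part of this diff; a minimal sketch of the registry pattern it implies (names and signature are assumptions, not confirmed by the commit) could look like:

    # Hypothetical sketch; the real decorator lives elsewhere in the repo.
    _HANDLERS = {}

    def register_handler(spider_name):
        """Map a spider name to its DB handler class."""
        def decorator(cls):
            _HANDLERS[spider_name] = cls
            return cls
        return decorator

    def get_handler(spider_name, **kwargs):
        # Look up and instantiate the handler registered for this spider.
        return _HANDLERS[spider_name](**kwargs)
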
@@ -134,7 +134,7 @@ class IAFDSpider(BaseSpider):
             for item in data:
                 yield from self._create_performer_request(href=item['href'], name=item['person'])
         else:
-            self.logger.warning(f"parse data error. {response.url}")
+            self._handle_invalid_response(response, page='astro')

     # Get the list, then request the detail pages
     def parse_birth_page(self, response):
@@ -146,7 +146,7 @@ class IAFDSpider(BaseSpider):
             for item in data:
                 yield from self._create_performer_request(href=item['href'], name=item['person'])
         else:
-            self.logger.warning(f"parse data error. {response.url}")
+            self._handle_invalid_response(response, page='birth')

     # Get the list, then request the detail pages
     def parse_ethnic_list_page(self, response):
@@ -166,7 +166,7 @@ class IAFDSpider(BaseSpider):

                 yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text, 'depth':1})
         else:
-            self.logger.warning(f"parse page error. url: {response.url}")
+            self._handle_invalid_response(response, page='ethnic_list')

     # Get the list, then request the detail pages
     def parse_ethnic_page(self, response):
@@ -187,7 +187,7 @@ class IAFDSpider(BaseSpider):
             else:
                 self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
         else:
-            self.logger.warning(f"parse data error. {response.url}")
+            self._handle_invalid_response(response, page='ethnic')

     def parse_distributors_list_page(self, response):
         select_element = response.css('select[name="Distrib"]')
@@ -205,7 +205,7 @@ class IAFDSpider(BaseSpider):

                 yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
         else:
-            self.logger.warning(f"parse page error. url: {response.url}")
+            self._handle_invalid_response(response, page='dist_list')

     def parse_studios_list_page(self, response):
         select_element = response.css('select[name="Studio"]')
@@ -222,7 +222,7 @@ class IAFDSpider(BaseSpider):

                 yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
         else:
-            self.logger.warning(f"parse page error. url: {response.url}")
+            self._handle_invalid_response(response, page='stu_list')

     def parse_stu_dist_page(self, response):
         list_type = response.meta.get('list_type', '')
@@ -232,7 +232,7 @@ class IAFDSpider(BaseSpider):
             for movie in data:
                 yield from self._create_movie_request(href=movie['href'], title=movie['title'])
         else:
-            self.logger.warning(f"fetched data error. {response.url}")
+            self._handle_invalid_response(response, page='dist_stu')

     # Unified handling for issuing movie-query requests
     def _create_performer_request(self, href, name):
@@ -284,7 +284,7 @@ class IAFDSpider(BaseSpider):
             for item in movies:
                 yield from self._create_movie_request(href=item['href'], title=item['title'])
         else:
-            self._handle_invalid_response(response)
+            self._handle_invalid_response(response, page='actor')

     # Parse and process the movie detail page
     def parse_movie_detail_page(self, response):
@@ -316,34 +316,29 @@ class IAFDSpider(BaseSpider):
                 yield from self._create_performer_request(href=director['href'], name=director['name'])

         else:
-            self._handle_invalid_response(response)
+            self._handle_invalid_response(response, page='movie')

     # Centralized check and handling of invalid responses
-    def _handle_invalid_response(self, response):
-        update_flag = False
+    def _handle_invalid_response(self, response, page=None):
         if response.status in [200]:
             if "invalid or outdated page" in response.text.lower():
                 self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
-                # TODO: update performers or movies that return 404
-                update_flag = True
             else:
                 self.logger.warning(f"unknown page. url:{response.url}, content: {response.text[:500]}")

         elif response.status in [404, 403]:
             self.logger.warning(f"get 404 page. url: {response.url}")
-            # TODO: update performers or movies that return 404
-            update_flag = True
         else:
             self.logger.warning(f"unknown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")

-        if update_flag:
-            if 'person.rme' in response.url:
+        if page:
+            if page == 'actor':
                 item = IafdPerformersItem()
                 item['href'] = response.url
                 item['name'] = response.meta.get('name', '')
                 item['is_full_data'] = 404
                 yield item
-            elif 'title.rme' in response.url:
+            elif page == 'movie':
                 item = IafdMoviesItem()
                 item['href'] = response.url
                 item['title'] = response.meta.get('title', '')
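
Note: the error path used to sniff the URL ('person.rme' vs 'title.rme') to decide whether to emit a performer or a movie tombstone; callers now pass an explicit page tag, and the update_flag/TODO plumbing disappears. Because _handle_invalid_response contains yield, it is a generator: a caller only gets the 404 tombstone into the pipeline by delegating with yield from, as in this sketch (the extractor name is hypothetical):

    def parse_performer_page(self, response):
        movies = self._extract_movies(response)   # hypothetical extractor
        if movies:
            for item in movies:
                yield from self._create_movie_request(href=item['href'], title=item['title'])
        else:
            # Without `yield from`, the generator would be created and
            # discarded, and the tombstone item would never be emitted.
            yield from self._handle_invalid_response(response, page='actor')
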
@@ -133,7 +133,7 @@ class JavbusSpiderSpider(BaseSpider):
                 yield scrapy.Request(url,
                                      callback=self.parse_actor_detail_page,
                                      headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),  # use GET headers
-                                     meta={'lang': lang, 'actor_name': name, 'actor_url': url })
+                                     meta={'lang': lang, 'actor_name': name, 'actor_url': url, 'item_type':'actor' })

         self.crawler.stats.inc_value(f"{self.name}/actor_all")
         if next_url:
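
Note: actor-detail requests now carry 'item_type': 'actor' in meta, matching the movie requests that already set 'item_type': 'movie', so downstream handlers can tell what a response was supposed to be. A self-contained sketch of that meta round-trip (placeholder spider and URL, not from the commit):

    import scrapy

    class MetaTagSketch(scrapy.Spider):
        """Illustrative only: shows the item_type meta round-trip."""
        name = 'meta_tag_sketch'
        start_urls = ['https://example.com/']

        def parse(self, response):
            # Tag the request so any later handler knows what it was fetching.
            yield scrapy.Request(
                'https://example.com/actor/1',   # placeholder URL
                callback=self.parse_actor_detail_page,
                meta={'actor_name': 'someone', 'item_type': 'actor'},
            )

        def parse_actor_detail_page(self, response):
            # Scrapy copies request.meta onto the response, so the tag
            # survives scheduling, retries, and redirects.
            item_type = response.meta.get('item_type', '')
            self.logger.info(f"fetched a {item_type} page: {response.url}")
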
@@ -143,7 +143,7 @@ class JavbusSpiderSpider(BaseSpider):
                                  meta=response.meta
                                  )
         else:
-            self.logger.warning(f"parse data error. {response.url}")
+            self._handle_invalid_response(response, page='actor_list')


     # Handle the detailed parse page
@@ -160,7 +160,7 @@ class JavbusSpiderSpider(BaseSpider):
             avatar = data.get('avatar',{})
             item = JavbusActorsItem()
             item['href'] = normalize_url(actor_url)
-            item[f"{lang}_name"] = avatar['name']
+            item[f"{lang}_name"] = avatar.get('name', '')
             yield item
         return None
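
Note: avatar['name'] raised a KeyError whenever the scraped avatar block lacked a name, aborting the whole callback; avatar.get('name', '') degrades to an empty string so the item is still yielded. Illustrative values, not from the site:

    avatar = {}                # avatar block came back without a name
    avatar['name']             # raises KeyError and kills the callback
    avatar.get('name', '')     # returns '' and the item is still emitted
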
@@ -177,7 +177,7 @@ class JavbusSpiderSpider(BaseSpider):
             yield scrapy.Request(next_url,
                                  callback=self.parse_actor_detail_page,
                                  headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),  # use GET headers
-                                 meta={'lang': lang, 'actor_name': actor_name, 'actor_url': actor_url })
+                                 meta={'lang': lang, 'actor_name': actor_name, 'actor_url': actor_url, 'item_type':'actor' })
         else:
             self.logger.info(f"actor ({actor_name}) read all pages. url :{response.url}")
             self.crawler.stats.inc_value(f"{self.name}/actor_done")
@@ -209,7 +209,7 @@ class JavbusSpiderSpider(BaseSpider):
                                  meta={'title': item.get('title', ''), 'item_type':'movie', 'cache':True}
                                  )
         else:
-            self.logger.warning(f"fetched data error. {response.url}")
+            self._handle_invalid_response(response, page='actor')


     def parse_movie_detail_page(self, response):
@@ -268,7 +268,7 @@ class JavbusSpiderSpider(BaseSpider):
                 prefix = 'series'
                 )
         else:
-            self.logger.warning(f"fetched data error. {response.url}")
+            self._handle_invalid_response(response, page='movie')

     def _create_multi_langs_request(self, href, name, callback, prefix):
         """Create multi-language requests for a single object."""
@@ -360,24 +360,34 @@ class JavbusSpiderSpider(BaseSpider):
             else:
                 self.logger.info(f"movies list ({prefix}) read all pages. url :{response.url}")
         else:
-            self.logger.warning(f"parse data error. {response.url}")
+            self._handle_invalid_response(response, page='movie_list')


-    def custom_block_check(self, response):
-        item_type = response.meta.get('item_type', '')
-        if "invalid or outdated page" in response.text.lower():
-            self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {item_type}")
-            return "invalid or outdated page"
-        else:
-            self.logger.info(f"right content. url: {response.url}")
-
-        return None
-
-    # Handle page errors, mainly 404 and 403
-    def handle_blocked(self, response, reason):
-        item_type = response.meta.get('item_type', '')
-        if response.status in [404, 403]:
-            self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}")
+    # Centralized check and handling of invalid responses
+    def _handle_invalid_response(self, response, page=None):
+        if response.status in [200]:
+            if "404 page not found" in response.text.lower():
+                self.logger.warning(f"404 Page Not Found. url: {response.url}, status_code: {response.status}")
+            else:
+                self.logger.warning(f"unknown page. url:{response.url}, content: {response.text[:500]}")
+        elif response.status in [404, 403]:
+            self.logger.warning(f"get 404 page. url: {response.url}")
+        else:
+            self.logger.warning(f"unknown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")
+
+        if page:
+            if page == 'actor':
+                item = JavbusActorsItem()
+                item['href'] = response.url
+                item['zh_name'] = response.meta.get('actor_name', '')
+                item['is_full_data'] = 404
+                yield item
+            elif page == 'movie':
+                item = JavbusMoviesItem()
+                item['href'] = response.url
+                item['title'] = response.meta.get('title', '')
+                item['is_full_data'] = 404
+                yield item

     def load_existed_actors(self):
         query_args = {}
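
Note: the spider drops its custom_block_check/handle_blocked hooks in favor of the same _handle_invalid_response pattern the IAFD spider uses: dead pages become tombstone items with is_full_data = 404 instead of log lines. One plausible way a pipeline could persist those tombstones so dead URLs are not re-crawled (the pipeline, DB path, and table name are all assumptions, not part of this commit):

    import sqlite3

    class MarkDeadPagesPipeline:
        """Hypothetical pipeline: flag rows whose URLs now return 404."""
        def open_spider(self, spider):
            self.conn = sqlite3.connect('shared.db')   # path is an assumption

        def close_spider(self, spider):
            self.conn.commit()
            self.conn.close()

        def process_item(self, item, spider):
            if item.get('is_full_data') == 404:
                # Flag the row instead of retrying the dead URL every run.
                self.conn.execute(
                    "UPDATE javbus_actors SET is_full_data = 404 WHERE href = ?",
                    (item['href'],),
                )
            return item
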
@@ -347,7 +347,7 @@ class JavbusCrawler(GenericCrawler):
         # Parse the title
         b_tag = soup.select_one('.alert.alert-success.alert-common p b')
         if not b_tag:
-            logging.warning(f'found no title. href: {href}')
+            logging.debug(f'found no title. href: {href}')
         else:
             # Get the text content
             title_text = b_tag.get_text(strip=True)
@@ -372,7 +372,7 @@ class JavbusCrawler(GenericCrawler):
         # Find the <a> tags
         a_tags = soup.select('.alert.alert-success.alert-common a.mypointer')
         if not a_tags:
-            logging.warning(f'found no movie cnt. href: {href}')
+            logging.debug(f'found no movie cnt. href: {href}')
         else:
             for a in a_tags:
                 text = a.get_text(strip=True)