modify scripts
This commit is contained in:
@ -978,8 +978,8 @@ class JavBusDBHandler(SQLiteDBHandler):
|
||||
|
||||
@register_handler(comm.SPIDER_NAME_IAFD)
|
||||
class IAFDDBHandler(SQLiteDBHandler):
|
||||
#def __init__(self, db_path=shared_db_path):
|
||||
def __init__(self, db_path=test_db_path):
|
||||
def __init__(self, db_path=shared_db_path):
|
||||
#def __init__(self, db_path=test_db_path):
|
||||
super().__init__(db_path)
|
||||
self.tbl_name_performers = 'iafd_performers'
|
||||
self.tbl_name_movies = 'iafd_movies'
|
||||
|
||||
@ -134,7 +134,7 @@ class IAFDSpider(BaseSpider):
|
||||
for item in data:
|
||||
yield from self._create_performer_request(href=item['href'], name=item['person'])
|
||||
else:
|
||||
self.logger.warning(f"parse data error. {response.url}")
|
||||
self._handle_invalid_response(response, page='astro')
|
||||
|
||||
# 获得列表,查询详情
|
||||
def parse_birth_page(self, response):
|
||||
@ -146,7 +146,7 @@ class IAFDSpider(BaseSpider):
|
||||
for item in data:
|
||||
yield from self._create_performer_request(href=item['href'], name=item['person'])
|
||||
else:
|
||||
self.logger.warning(f"parse data error. {response.url}")
|
||||
self._handle_invalid_response(response, page='birth')
|
||||
|
||||
# 获得列表,查询详情
|
||||
def parse_ethnic_list_page(self, response):
|
||||
@ -166,7 +166,7 @@ class IAFDSpider(BaseSpider):
|
||||
|
||||
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text, 'depth':1})
|
||||
else:
|
||||
self.logger.warning(f"parse page error. url: {response.url}")
|
||||
self._handle_invalid_response(response, page='ethnic_list')
|
||||
|
||||
# 获得列表,查询详情
|
||||
def parse_ethnic_page(self, response):
|
||||
@ -187,7 +187,7 @@ class IAFDSpider(BaseSpider):
|
||||
else:
|
||||
self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
|
||||
else:
|
||||
self.logger.warning(f"parse data error. {response.url}")
|
||||
self._handle_invalid_response(response, page='ethnic')
|
||||
|
||||
def parse_distributors_list_page(self, response):
|
||||
select_element = response.css('select[name="Distrib"]')
|
||||
@ -205,7 +205,7 @@ class IAFDSpider(BaseSpider):
|
||||
|
||||
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
|
||||
else:
|
||||
self.logger.warning(f"parse page error. url: {response.url}")
|
||||
self._handle_invalid_response(response, page='dist_list')
|
||||
|
||||
def parse_studios_list_page(self, response):
|
||||
select_element = response.css('select[name="Studio"]')
|
||||
@ -222,7 +222,7 @@ class IAFDSpider(BaseSpider):
|
||||
|
||||
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
|
||||
else:
|
||||
self.logger.warning(f"parse page error. url: {response.url}")
|
||||
self._handle_invalid_response(response, page='stu_list')
|
||||
|
||||
def parse_stu_dist_page(self, response):
|
||||
list_type = response.meta.get('list_type', '')
|
||||
@ -232,7 +232,7 @@ class IAFDSpider(BaseSpider):
|
||||
for movie in data:
|
||||
yield from self._create_movie_request(href=movie['href'], title=movie['title'])
|
||||
else:
|
||||
self.logger.warning(f"fetched data error. {response.url}")
|
||||
self._handle_invalid_response(response, page='dist_stu')
|
||||
|
||||
# 统一处理发起影片查询的请求
|
||||
def _create_performer_request(self, href, name):
|
||||
@ -284,7 +284,7 @@ class IAFDSpider(BaseSpider):
|
||||
for item in movies:
|
||||
yield from self._create_movie_request(href=item['href'], title=item['title'])
|
||||
else:
|
||||
self._handle_invalid_response(response)
|
||||
self._handle_invalid_response(response, page='actor')
|
||||
|
||||
# 影片详情页解析和处理
|
||||
def parse_movie_detail_page(self, response):
|
||||
@ -316,34 +316,29 @@ class IAFDSpider(BaseSpider):
|
||||
yield from self._create_performer_request(href=director['href'], name=director['name'])
|
||||
|
||||
else:
|
||||
self._handle_invalid_response(response)
|
||||
self._handle_invalid_response(response, page='movie')
|
||||
|
||||
# 统一判断并处理异常
|
||||
def _handle_invalid_response(self, response):
|
||||
update_flag = False
|
||||
def _handle_invalid_response(self, response, page=None):
|
||||
if response.status in [200]:
|
||||
if "invalid or outdated page" in response.text.lower():
|
||||
self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
|
||||
# TODO: 更新404的演员或者影片
|
||||
update_flag = True
|
||||
else:
|
||||
self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
|
||||
|
||||
elif response.status in [404, 403]:
|
||||
self.logger.warning(f"get 404 page. url: {response.url}")
|
||||
# TODO: 更新404的演员或者影片
|
||||
update_flag = True
|
||||
else:
|
||||
self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")
|
||||
|
||||
if update_flag:
|
||||
if 'person.rme' in response.url:
|
||||
if page:
|
||||
if page == 'actor':
|
||||
item = IafdPerformersItem()
|
||||
item['href'] = response.url
|
||||
item['name'] = response.meta.get('name', '')
|
||||
item['is_full_data'] = 404
|
||||
yield item
|
||||
elif 'title.rme' in response.url:
|
||||
elif page == 'movie':
|
||||
item = IafdMoviesItem()
|
||||
item['href'] = response.url
|
||||
item['title'] = response.meta.get('title', '')
|
||||
|
||||
@ -133,7 +133,7 @@ class JavbusSpiderSpider(BaseSpider):
|
||||
yield scrapy.Request(url,
|
||||
callback=self.parse_actor_detail_page,
|
||||
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # 使用GET头
|
||||
meta={'lang': lang, 'actor_name': name, 'actor_url': url })
|
||||
meta={'lang': lang, 'actor_name': name, 'actor_url': url, 'item_type':'actor' })
|
||||
|
||||
self.crawler.stats.inc_value(f"{self.name}/actor_all")
|
||||
if next_url:
|
||||
@ -143,7 +143,7 @@ class JavbusSpiderSpider(BaseSpider):
|
||||
meta=response.meta
|
||||
)
|
||||
else:
|
||||
self.logger.warning(f"parse data error. {response.url}")
|
||||
self._handle_invalid_response(response, page='actor_list')
|
||||
|
||||
|
||||
# 处理详细的解析页面
|
||||
@ -160,7 +160,7 @@ class JavbusSpiderSpider(BaseSpider):
|
||||
avatar = data.get('avatar',{})
|
||||
item = JavbusActorsItem()
|
||||
item['href'] = normalize_url(actor_url)
|
||||
item[f"{lang}_name"] = avatar['name']
|
||||
item[f"{lang}_name"] = avatar.get('name', '')
|
||||
yield item
|
||||
return None
|
||||
|
||||
@ -177,7 +177,7 @@ class JavbusSpiderSpider(BaseSpider):
|
||||
yield scrapy.Request(next_url,
|
||||
callback=self.parse_actor_detail_page,
|
||||
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # 使用GET头
|
||||
meta={'lang': lang, 'actor_name': actor_name, 'actor_url': actor_url })
|
||||
meta={'lang': lang, 'actor_name': actor_name, 'actor_url': actor_url, 'item_type':'actor' })
|
||||
else:
|
||||
self.logger.info(f"actor ({actor_name}) read all pages. url :{response.url}")
|
||||
self.crawler.stats.inc_value(f"{self.name}/actor_done")
|
||||
@ -209,7 +209,7 @@ class JavbusSpiderSpider(BaseSpider):
|
||||
meta={'title': item.get('title', ''), 'item_type':'movie', 'cache':True}
|
||||
)
|
||||
else:
|
||||
self.logger.warning(f"fetched data error. {response.url}")
|
||||
self._handle_invalid_response(response, page='actor')
|
||||
|
||||
|
||||
def parse_movie_detail_page(self, response):
|
||||
@ -268,7 +268,7 @@ class JavbusSpiderSpider(BaseSpider):
|
||||
prefix = 'series'
|
||||
)
|
||||
else:
|
||||
self.logger.warning(f"fetched data error. {response.url}")
|
||||
self._handle_invalid_response(response, page='movie')
|
||||
|
||||
def _create_multi_langs_request(self, href, name, callback, prefix):
|
||||
"""创建单个对象的多语言请求"""
|
||||
@ -360,24 +360,34 @@ class JavbusSpiderSpider(BaseSpider):
|
||||
else:
|
||||
self.logger.info(f"movies list ({prefix}) read all pages. url :{response.url}")
|
||||
else:
|
||||
self.logger.warning(f"parse data error. {response.url}")
|
||||
self._handle_invalid_response(response, page='movie_list')
|
||||
|
||||
|
||||
def custom_block_check(self, response):
|
||||
item_type = response.meta.get('item_type', '')
|
||||
if "invalid or outdated page" in response.text.lower():
|
||||
self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {item_type}")
|
||||
return "invalid or outdated page"
|
||||
else:
|
||||
self.logger.info(f"right content. url: {response.url}")
|
||||
|
||||
return None
|
||||
|
||||
# 处理页面异常,主要是404, 403
|
||||
def handle_blocked(self, response, reason):
|
||||
item_type = response.meta.get('item_type', '')
|
||||
if response.status in [404, 403]:
|
||||
self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}")
|
||||
# 统一判断并处理异常
|
||||
def _handle_invalid_response(self, response, page=None):
    """Log why a response could not be parsed and build a placeholder item.

    The diagnosis is logged eagerly, at call time. This is deliberately NOT a
    generator function: a generator body does not run until it is iterated,
    so a bare ``self._handle_invalid_response(...)`` statement would silently
    do nothing -- not even log.

    Args:
        response: The response that failed to parse. ``status``, ``text``,
            ``url`` and ``meta`` are read from it.
        page: Kind of page that failed. Only ``'actor'`` and ``'movie'``
            produce an item; any other value (including ``None``) produces
            nothing.

    Returns:
        An iterator over zero or one items flagged with ``is_full_data = 404``.
        Callers must consume it (``yield from self._handle_invalid_response(...)``)
        for the item to be emitted from the callback.
    """
    if response.status == 200:
        # Both sides must be lower case: the previous mixed-case needle could
        # never match the lowercased body, so this branch was unreachable.
        if "404 page not found" in response.text.lower():
            self.logger.warning(f"404 Page Not Found. url: {response.url}, status_code: {response.status}")
        else:
            self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
    elif response.status in (404, 403):
        self.logger.warning(f"get 404 page. url: {response.url}")
    else:
        self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")

    # NOTE(review): the placeholder is built for every status, not only for
    # 404/403 -- confirm that flagging e.g. a 5xx response as 404 is intended.
    items = []
    if page == 'actor':
        item = JavbusActorsItem()
        item['href'] = response.url
        item['zh_name'] = response.meta.get('actor_name', '')
        item['is_full_data'] = 404
        items.append(item)
    elif page == 'movie':
        item = JavbusMoviesItem()
        item['href'] = response.url
        item['title'] = response.meta.get('title', '')
        item['is_full_data'] = 404
        items.append(item)
    return iter(items)
|
||||
|
||||
def load_existed_actors(self):
|
||||
query_args = {}
|
||||
|
||||
@ -347,7 +347,7 @@ class JavbusCrawler(GenericCrawler):
|
||||
# 解析标题
|
||||
b_tag = soup.select_one('.alert.alert-success.alert-common p b')
|
||||
if not b_tag:
|
||||
logging.warning(f'found no title. href: {href}')
|
||||
logging.debug(f'found no title. href: {href}')
|
||||
else:
|
||||
# 获取文本内容
|
||||
title_text = b_tag.get_text(strip=True)
|
||||
@ -372,7 +372,7 @@ class JavbusCrawler(GenericCrawler):
|
||||
# 查找a标签
|
||||
a_tags = soup.select('.alert.alert-success.alert-common a.mypointer')
|
||||
if not a_tags:
|
||||
logging.warning(f'found no movie cnt. href: {href}')
|
||||
logging.debug(f'found no movie cnt. href: {href}')
|
||||
else:
|
||||
for a in a_tags:
|
||||
text = a.get_text(strip=True)
|
||||
|
||||
Reference in New Issue
Block a user