modify scripts

This commit is contained in:
2025-07-28 10:02:40 +08:00
parent eb3b27ecb0
commit db709f3ba2
5 changed files with 2687 additions and 44 deletions

View File

@ -978,8 +978,8 @@ class JavBusDBHandler(SQLiteDBHandler):
@register_handler(comm.SPIDER_NAME_IAFD)
class IAFDDBHandler(SQLiteDBHandler):
#def __init__(self, db_path=shared_db_path):
def __init__(self, db_path=test_db_path):
def __init__(self, db_path=shared_db_path):
#def __init__(self, db_path=test_db_path):
super().__init__(db_path)
self.tbl_name_performers = 'iafd_performers'
self.tbl_name_movies = 'iafd_movies'

View File

@ -134,7 +134,7 @@ class IAFDSpider(BaseSpider):
for item in data:
yield from self._create_performer_request(href=item['href'], name=item['person'])
else:
self.logger.warning(f"parse data error. {response.url}")
self._handle_invalid_response(response, page='astro')
# 获得列表,查询详情
def parse_birth_page(self, response):
@ -146,7 +146,7 @@ class IAFDSpider(BaseSpider):
for item in data:
yield from self._create_performer_request(href=item['href'], name=item['person'])
else:
self.logger.warning(f"parse data error. {response.url}")
self._handle_invalid_response(response, page='birth')
# 获得列表,查询详情
def parse_ethnic_list_page(self, response):
@ -166,7 +166,7 @@ class IAFDSpider(BaseSpider):
yield scrapy.Request(ethnic_url, callback=self.parse_ethnic_page, meta={'ethnic': text, 'depth':1})
else:
self.logger.warning(f"parse page error. url: {response.url}")
self._handle_invalid_response(response, page='ethnic_list')
# 获得列表,查询详情
def parse_ethnic_page(self, response):
@ -187,7 +187,7 @@ class IAFDSpider(BaseSpider):
else:
self.logger.info(f"found all pages. ethnic: {ethnic}, url: {response.url}")
else:
self.logger.warning(f"parse data error. {response.url}")
self._handle_invalid_response(response, page='ethnic')
def parse_distributors_list_page(self, response):
select_element = response.css('select[name="Distrib"]')
@ -205,7 +205,7 @@ class IAFDSpider(BaseSpider):
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'dist'})
else:
self.logger.warning(f"parse page error. url: {response.url}")
self._handle_invalid_response(response, page='dist_list')
def parse_studios_list_page(self, response):
select_element = response.css('select[name="Studio"]')
@ -222,7 +222,7 @@ class IAFDSpider(BaseSpider):
yield scrapy.Request(dis_url, callback=self.parse_stu_dist_page, meta={'list_type': 'stu'})
else:
self.logger.warning(f"parse page error. url: {response.url}")
self._handle_invalid_response(response, page='stu_list')
def parse_stu_dist_page(self, response):
list_type = response.meta.get('list_type', '')
@ -232,7 +232,7 @@ class IAFDSpider(BaseSpider):
for movie in data:
yield from self._create_movie_request(href=movie['href'], title=movie['title'])
else:
self.logger.warning(f"fetched data error. {response.url}")
self._handle_invalid_response(response, page='dist_stu')
# 统一处理发起影片查询的请求
def _create_performer_request(self, href, name):
@ -284,7 +284,7 @@ class IAFDSpider(BaseSpider):
for item in movies:
yield from self._create_movie_request(href=item['href'], title=item['title'])
else:
self._handle_invalid_response(response)
self._handle_invalid_response(response, page='actor')
# 影片详情页解析和处理
def parse_movie_detail_page(self, response):
@ -316,34 +316,29 @@ class IAFDSpider(BaseSpider):
yield from self._create_performer_request(href=director['href'], name=director['name'])
else:
self._handle_invalid_response(response)
self._handle_invalid_response(response, page='movie')
# 统一判断并处理异常
def _handle_invalid_response(self, response):
update_flag = False
def _handle_invalid_response(self, response, page=None):
if response.status in [200]:
if "invalid or outdated page" in response.text.lower():
self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
# TODO: 更新404的演员或者影片
update_flag = True
else:
self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
elif response.status in [404, 403]:
self.logger.warning(f"get 404 page. url: {response.url}")
# TODO: 更新404的演员或者影片
update_flag = True
else:
self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")
if update_flag:
if 'person.rme' in response.url:
if page:
if page == 'actor':
item = IafdPerformersItem()
item['href'] = response.url
item['name'] = response.meta.get('name', '')
item['is_full_data'] = 404
yield item
elif 'title.rme' in response.url:
elif page == 'movie':
item = IafdMoviesItem()
item['href'] = response.url
item['title'] = response.meta.get('title', '')

View File

@ -133,7 +133,7 @@ class JavbusSpiderSpider(BaseSpider):
yield scrapy.Request(url,
callback=self.parse_actor_detail_page,
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # 使用GET头
meta={'lang': lang, 'actor_name': name, 'actor_url': url })
meta={'lang': lang, 'actor_name': name, 'actor_url': url, 'item_type':'actor' })
self.crawler.stats.inc_value(f"{self.name}/actor_all")
if next_url:
@ -143,7 +143,7 @@ class JavbusSpiderSpider(BaseSpider):
meta=response.meta
)
else:
self.logger.warning(f"parse data error. {response.url}")
self._handle_invalid_response(response, page='actor_list')
# 处理详细的解析页面
@ -160,7 +160,7 @@ class JavbusSpiderSpider(BaseSpider):
avatar = data.get('avatar',{})
item = JavbusActorsItem()
item['href'] = normalize_url(actor_url)
item[f"{lang}_name"] = avatar['name']
item[f"{lang}_name"] = avatar.get('name', '')
yield item
return None
@ -177,7 +177,7 @@ class JavbusSpiderSpider(BaseSpider):
yield scrapy.Request(next_url,
callback=self.parse_actor_detail_page,
headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), # 使用GET头
meta={'lang': lang, 'actor_name': actor_name, 'actor_url': actor_url })
meta={'lang': lang, 'actor_name': actor_name, 'actor_url': actor_url, 'item_type':'actor' })
else:
self.logger.info(f"actor ({actor_name}) read all pages. url :{response.url}")
self.crawler.stats.inc_value(f"{self.name}/actor_done")
@ -209,7 +209,7 @@ class JavbusSpiderSpider(BaseSpider):
meta={'title': item.get('title', ''), 'item_type':'movie', 'cache':True}
)
else:
self.logger.warning(f"fetched data error. {response.url}")
self._handle_invalid_response(response, page='actor')
def parse_movie_detail_page(self, response):
@ -268,7 +268,7 @@ class JavbusSpiderSpider(BaseSpider):
prefix = 'series'
)
else:
self.logger.warning(f"fetched data error. {response.url}")
self._handle_invalid_response(response, page='movie')
def _create_multi_langs_request(self, href, name, callback, prefix):
"""创建单个对象的多语言请求"""
@ -360,24 +360,34 @@ class JavbusSpiderSpider(BaseSpider):
else:
self.logger.info(f"movies list ({prefix}) read all pages. url :{response.url}")
else:
self.logger.warning(f"parse data error. {response.url}")
self._handle_invalid_response(response, page='movie_list')
def custom_block_check(self, response):
item_type = response.meta.get('item_type', '')
if "invalid or outdated page" in response.text.lower():
self.logger.warning(f"invalid or outdated page. url: {response.url}, item_type: {item_type}")
return "invalid or outdated page"
else:
self.logger.info(f"right content. url: {response.url}")
return None
# 处理页面异常主要是404, 403
def handle_blocked(self, response, reason):
item_type = response.meta.get('item_type', '')
if response.status in [404, 403]:
self.logger.warning(f"get 404 page. url: {response.url}, item_type: {item_type}")
# 统一判断并处理异常
def _handle_invalid_response(self, response, page=None):
if response.status in [200]:
if "404 Page Not Found" in response.text.lower():
self.logger.warning(f"404 Page Not Found. url: {response.url}, status_code: {response.status}")
else:
self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
elif response.status in [404, 403]:
self.logger.warning(f"get 404 page. url: {response.url}")
else:
self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")
if page:
if page == 'actor':
item = JavbusActorsItem()
item['href'] = response.url
item['zh_name'] = response.meta.get('actor_name', '')
item['is_full_data'] = 404
yield item
elif page == 'movie' :
item = JavbusMoviesItem()
item['href'] = response.url
item['title'] = response.meta.get('title', '')
item['is_full_data'] = 404
yield item
def load_existed_actors(self):
query_args = {}

View File

@ -347,7 +347,7 @@ class JavbusCrawler(GenericCrawler):
# 解析标题
b_tag = soup.select_one('.alert.alert-success.alert-common p b')
if not b_tag:
logging.warning(f'found no title. href: {href}')
logging.debug(f'found no title. href: {href}')
else:
# 获取文本内容
title_text = b_tag.get_text(strip=True)
@ -372,7 +372,7 @@ class JavbusCrawler(GenericCrawler):
# 查找a标签
a_tags = soup.select('.alert.alert-success.alert-common a.mypointer')
if not a_tags:
logging.warning(f'found no movie cnt. href: {href}')
logging.debug(f'found no movie cnt. href: {href}')
else:
for a in a_tags:
text = a.get_text(strip=True)

warn.log — new file, 2638 lines

File diff suppressed because it is too large. [Load Diff]