modify scripts
@@ -247,7 +247,7 @@ def fetch_performers_detail_once(perfomers_list):
         logging.debug(f"Fetching data for performer ({person}), url {url} ...")
         soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
         # File was read from the local cache; ignore it
-        if skip_local and status_code == 99 :
+        if skip_local and status_code == scraper.http_code_local:
             last_performer_id = curr_id
             continue
         if soup:
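
The validator here is bound with functools.partial and later invoked with only the parsed soup. generic_validator itself is not part of this diff; a minimal sketch of what it plausibly does, assuming it merely confirms that the expected element is present on the page:

from bs4 import BeautifulSoup

def generic_validator(soup: BeautifulSoup, tag: str, identifier: str, attr_type: str) -> bool:
    # True when the page contains e.g. <div id="headshot">, i.e. the detail
    # page rendered fully rather than an error or challenge page.
    return soup.find(tag, attrs={attr_type: identifier}) is not None
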
@@ -272,11 +272,11 @@ def fetch_performers_detail_once(perfomers_list):
                 })
             else:
                 logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
-        elif status_code and status_code == 404:
-            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=2)
+        elif status_code and status_code == scraper.http_code_404:
+            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=scraper.http_code_404)
             logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skipping...')
-        elif status_code and status_code == 601:
-            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=3)
+        elif status_code and status_code == scraper.http_code_url:
+            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=scraper.http_code_url)
             logging.warning(f'601 page (wrong url). id: {performer_id}, name: {person}, url: {url}, Skipping...')
         else:
             logging.warning(f'fetch_page error. person: ({person}), url: {url}')
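
db_tools.insert_or_update_performer_404 is not shown in this commit. Assuming performers live in a SQLite table with a unique href column, the upsert it performs might look roughly like the sketch below; the table and column names are guesses taken from the call sites:

import sqlite3

def insert_or_update_performer_404(name, href, is_full_data, db_path="scraper.db"):
    # Record a dead link, storing the sentinel code itself in is_full_data
    # so later queries can filter it out; returns the row id.
    with sqlite3.connect(db_path) as conn:
        conn.execute(
            "INSERT INTO performers (name, href, is_full_data) VALUES (?, ?, ?) "
            "ON CONFLICT(href) DO UPDATE SET is_full_data = excluded.is_full_data",
            (name, href, is_full_data),
        )
        row = conn.execute("SELECT id FROM performers WHERE href = ?", (href,)).fetchone()
        return row[0] if row else None

Writing the scraper's own code (404 or 601) into is_full_data, instead of the old magic values 2 and 3, keeps this write path aligned with the is_full_data_not_in filters used below.
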
@@ -293,7 +293,7 @@ def fetch_performers_detail():
     # Fetch the list of new performers
     while True:
         if force:  # walk the whole table from the start
-            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
+            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, is_full_data_not_in=[scraper.http_code_404, scraper.http_code_url], order_by='id asc', limit=limit_count)
         else:  # update only
             perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count)
         if len(perfomers_list) < 1:
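
query_performer_hrefs is equally opaque here; assuming the same SQLite schema, the is_full_data_not_in keyword presumably expands to a NOT IN clause, roughly like this hypothetical reconstruction:

import sqlite3

def query_performer_hrefs(start_id=0, is_full_data=None, is_full_data_not_in=None,
                          order_by="id asc", limit=100, db_path="scraper.db"):
    # Hypothetical sketch; the real db_tools helper is not in this commit.
    sql = "SELECT id, name, href FROM performers WHERE id > ?"
    params = [start_id]
    if is_full_data is not None:
        sql += " AND is_full_data = ?"
        params.append(is_full_data)
    if is_full_data_not_in:
        placeholders = ",".join("?" * len(is_full_data_not_in))
        sql += f" AND is_full_data NOT IN ({placeholders})"
        params.extend(is_full_data_not_in)
    sql += f" ORDER BY {order_by} LIMIT ?"
    params.append(limit)
    with sqlite3.connect(db_path) as conn:
        return conn.execute(sql, params).fetchall()
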
@@ -322,7 +322,7 @@ def fetch_movies_detail():
     last_movie_id = 0
     while True:
         if force:  # walk the whole table from the start
-            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
+            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[scraper.http_code_404, scraper.http_code_url], order_by='id asc', limit=limit_count)
         else:  # update only
             movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
         if len(movies_list) < 1:
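
Both detail fetchers page through their table with a start_id cursor rather than an OFFSET. Reduced to its essentials, the loop shape is as follows (schematic only; the names follow the diff):

def walk_table(query_page, limit_count=200):
    # Keyset pagination: each batch starts after the last id already handled,
    # so rows inserted or updated mid-run cannot shift the window.
    last_id = 0
    while True:
        batch = query_page(start_id=last_id, limit=limit_count)
        if len(batch) < 1:
            break  # nothing left to process
        for curr_id, *rest in batch:
            # ... fetch and parse the detail page here ...
            last_id = curr_id
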
@@ -336,8 +336,9 @@ def fetch_movies_detail():
             logging.debug(f"Fetching data for movie: {curr_id}: ({title}), url {url} ...")
             soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
             # File was read from the local cache; ignore it
-            if skip_local and status_code == 99 :
+            if skip_local and status_code == scraper.http_code_local:
                 last_movie_id = curr_id
+                succ_count += 1
                 continue
             if soup:
                 movie_data = scraper.parse_page_movie(soup, url, title)
@@ -359,13 +360,13 @@ def fetch_movies_detail():
                     utils.write_movie_json(url, movie_data)
                 else:
                     logging.warning(f'parse_page_movie error. url: {url}')
-            elif status_code and status_code == 404:
+            elif status_code and status_code == scraper.http_code_404:
                 # Mark as processed
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=2)
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
                 logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
-            elif status_code and status_code == 601:
+            elif status_code and status_code == scraper.http_code_url:
                 # Mark as processed
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=3)
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_url)
                 logging.warning(f'601 page (wrong url). id: {movie_id}, title: ({title}), url: {url}, Skipping...')
             else:
                 logging.warning(f'fetch_page error. url: {url}')
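
After this commit the four-way branch on (soup, status_code) is identical, apart from log text, in the performer and movie loops. The commit does not factor it out; purely to make the shared pattern explicit, a hedged sketch:

import logging

http_code_404, http_code_url = 404, 601  # mirrors the scraper module's constants

def dispatch_fetch_result(soup, status_code, on_page, on_dead_link):
    # soup present: parse it; sentinel code: record the dead link with the
    # code itself as is_full_data; anything else: transient fetch failure.
    if soup:
        return on_page(soup)
    if status_code in (http_code_404, http_code_url):
        return on_dead_link(status_code)
    logging.warning("fetch_page error")
    return None
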
@@ -36,6 +36,11 @@ headers = {
 }
 scraper = cloudscraper.create_scraper()
 
+http_code_404 = 404
+http_code_login = 401
+http_code_url = 601
+http_code_local = 99
+
 save_raw_html = True
 load_from_local = True
 
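
The two invented codes sit just outside the real HTTP range: no HTTP status is below 100 or above 599, so http_code_local and http_code_url can never collide with a server response, while http_code_404 and http_code_login simply mirror their HTTP counterparts. A tiny check of that invariant, assuming the standard 100-599 status range:

def is_real_http_status(code):
    # Real HTTP statuses occupy 100-599; anything outside must be one of
    # the scraper's own sentinels.
    return 100 <= code <= 599

assert not is_real_http_status(99)    # http_code_local: served from disk
assert not is_real_http_status(601)   # http_code_url: malformed URL, never fetched
assert is_real_http_status(404) and is_real_http_status(401)
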
@@ -49,27 +54,27 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
 
         soup = BeautifulSoup(html_text, parser)
         if validator(soup):  # run the caller-supplied page check
-            return soup, 99  # a code below 100 marks the page as served from the local cache
+            return soup, http_code_local  # a code below 100 marks the page as served from the local cache
 
     for attempt in range(max_retries):
         try:
             if host_url not in url.lower():
                 logging.error(f'wrong url format: {url}')
-                return None, 601
+                return None, http_code_url
 
             response = scraper.get(url, headers=headers)
 
             # Handle the HTTP status code
             if response.status_code == 404:
                 logging.debug(f"Page not found (404): {url}")
-                return None, 404  # return 404 directly; the caller can skip this page
+                return None, http_code_404  # return 404 directly; the caller can skip this page
 
             response.raise_for_status()  # raise on other HTTP errors
 
             # An expired page is handled the same as a 404
             if "invalid or outdated page" in response.text.lower():
                 logging.debug(f"invalid or outdated page: {url}")
-                return None, 404  # return 404 directly; the caller can skip this page
+                return None, http_code_404  # return 404 directly; the caller can skip this page
 
             if save_raw_html:
                 utils.write_raw_html(url, response.text)
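
Seen from the call sites above, fetch_page hands back either a parsed soup or one of the sentinel codes. A minimal usage sketch under the same assumptions (the URL is a placeholder; the other names come from this module):

from functools import partial

skip_local = True
soup, status_code = fetch_page(
    "https://example.com/performer/12345",  # placeholder URL
    partial(generic_validator, tag="div", identifier="headshot", attr_type="id"),
)
if skip_local and status_code == http_code_local:
    pass  # page came from the on-disk cache; nothing new to do
elif soup:
    pass  # page fetched and validated: parse it
elif status_code == http_code_404:
    pass  # dead or outdated page: record it and skip permanently
elif status_code == http_code_url:
    pass  # malformed URL: the network was never hit, record and skip
else:
    pass  # network or validation failure: leave it for the next run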