Modify scripts: record 401/404 actor pages, demote per-actor logs to debug, and log a per-batch summary

This commit is contained in:
oscarz
2025-04-23 17:43:30 +08:00
parent f6385b83e4
commit 2e2218b623
2 changed files with 50 additions and 5 deletions

View File

@ -160,6 +160,8 @@ def fetch_performers_detail():
if len(perfomers_list) < 1:
logging.info(f'all performers fetched.')
break
succ_rows = 0
for performer in perfomers_list:
url = performer['href']
person = performer['name']
@ -168,8 +170,9 @@ def fetch_performers_detail():
next_url = url
all_movies = []
need_insert = True
while next_url:
logging.info(f"Fetching data for actor ({person}), url {next_url} ...")
logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="span", identifier="actor-section-name", attr_type="class"))
if soup:
data, next_url = scraper.parse_actor_detail(soup, next_url)
@ -179,11 +182,22 @@ def fetch_performers_detail():
all_movies.extend(data.get('movies', []))
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skiping...')
actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=2)
logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skiping...')
need_insert = False
break
elif status_code and status_code == 401:
actor_id = db_tools.insert_or_update_movie_404(name=person, href=url, is_full_data=3)
logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skiping...')
need_insert = False
break
else:
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
logging.warning(f'fetch_page error. url: {url}')
# If a 401 or 404 occurred, it has already been handled above; skip this performer
if not need_insert:
continue
# All of this person's movies have been fetched; start inserting the data
performer_id = db_tools.insert_or_update_actor({
'href': url,
@ -193,11 +207,14 @@ def fetch_performers_detail():
'credits':all_movies
})
if performer_id:
logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
last_perfomer_id = performer_id
succ_rows += 1
else:
logging.warning(f'insert person: ({person}) {url} failed.')
time.sleep(0.5)
logging.info(f'total request: {len(perfomers_list)}, succ: {succ_rows}, last performer id: {last_perfomer_id}')
# Debug mode: stop after the first batch
if debug:
return True