modify scripts

oscarz
2025-06-24 19:03:44 +08:00
parent c5feab2c22
commit 7e14a5f247
4 changed files with 610 additions and 226 deletions


@@ -246,13 +246,13 @@ def fetch_performers_detail():
     limit_count = 5 if debug else 100
     performers_list = []
     last_performer_id = 0
-    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
+    abnormal_codes = [craw.http_code_404, craw.http_code_redirect]
     def get_performers(**kwargs):
         if scan_mode == 1:
-            kwargs["from_actor_list"] = 1
+            kwargs["uncensored"] = 1
         elif scan_mode == 0:
-            kwargs["from_actor_list"] = 0
+            kwargs["uncensored"] = 0
         else:
             logging.debug(f"scan all records")
         kwargs["order_by"] = 'id asc'
@@ -278,29 +278,29 @@ def fetch_performers_detail():
         for performer in performers_list:
             url = performer['href']
             person = performer['name']
-            pic = ''
-            alias = []
+            uncensored = int(performer['uncensored'])
+            avatar = None
             next_url = url
             all_movies = []
             need_insert = True
             while next_url:
                 logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
-                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="span", identifier="actor-section-name", attr_type="class"))
+                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="alert alert-success alert-common", attr_type="class"))
                 if soup:
                     data, next_url = scraper.parse_actor_detail(soup, next_url)
                     if data:
-                        pic = data.get('pic', '')
-                        alias = data.get('alias', [])
+                        if not avatar:
+                            avatar = data.get('avatar')
                         all_movies.extend(data.get('movies', []))
-                elif status_code and status_code == scraper.http_code_404:
-                    actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_404)
+                elif status_code and status_code == craw.http_code_404:
+                    actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_404})
                     logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                     need_insert = False
                     break
-                elif status_code and status_code == scraper.http_code_login:
-                    actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_login)
+                elif status_code and status_code == craw.http_code_redirect:
+                    actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_redirect})
                     logging.warning(f'401 page (need login). id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                     need_insert = False
                     break
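
Note: the validator passed to fetch_page changes from the actor-section-name span to a div carrying the classes alert alert-success alert-common, and the per-actor fields collected shift from pic/alias to a single avatar plus the movie credits. Hypothetically, a generic_validator with the signature used at these call sites could be implemented with BeautifulSoup along these lines (a sketch under assumed semantics, not the repo's actual code):

from functools import partial
from bs4 import BeautifulSoup

def generic_validator(soup: BeautifulSoup, tag: str, identifier: str,
                      attr_type: str = "class") -> bool:
    """True when the page contains the marker element the caller expects."""
    if attr_type == "class":
        # Token-based CSS match ("div.alert.alert-success.alert-common"),
        # robust to class ordering in the served HTML.
        selector = tag + "".join("." + c for c in identifier.split())
        return soup.select_one(selector) is not None
    return soup.find(tag, attrs={attr_type: identifier}) is not None

# Usage mirroring the new call site:
validator = partial(generic_validator, tag="div",
                    identifier="alert alert-success alert-common")
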
@@ -311,16 +311,20 @@ def fetch_performers_detail():
             if not need_insert:
                 continue
+            #utils.pretty_print_json(avatar)
+            #utils.pretty_print_json(all_movies)
+            #continue
             # all movies for this performer fetched; now insert the data
-            performer_id = db_tools.insert_or_update_actor({
+            performer_id = db_tools.update_actor_detail({
                 'href': url,
                 'name': person,
-                'pic' : pic,
-                'alias' : alias,
-                'credits':all_movies
+                'avatar': avatar,
+                'credits':all_movies,
+                'uncensored':uncensored
             })
             if performer_id:
-                logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
+                logging.debug(f'insert/update one person, id: {performer_id}, person: ({person}), url: {url}')
                 last_performer_id = performer_id
                 succ_rows += 1
             else:
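
Note: the write path narrows from a generic insert_or_update_actor to update_actor_detail keyed on href, with the payload now carrying avatar and uncensored instead of pic and alias. A hypothetical shape for update_actor_detail, with table and column names assumed purely for illustration (sqlite3):

import json
import sqlite3

def update_actor_detail(data: dict, conn: sqlite3.Connection):
    """Update an existing actor row keyed by href; return its id, or None."""
    cur = conn.execute(
        "UPDATE actors SET name = ?, avatar = ?, credits = ?, uncensored = ? "
        "WHERE href = ?",
        (data['name'], json.dumps(data.get('avatar')),
         json.dumps(data.get('credits', [])), data.get('uncensored', 0),
         data['href']),
    )
    conn.commit()
    if cur.rowcount == 0:
        return None  # no such actor row; the caller logs the failure
    row = conn.execute("SELECT id FROM actors WHERE href = ?",
                       (data['href'],)).fetchone()
    return row[0] if row else None
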
@@ -334,10 +338,10 @@ def fetch_movies_detail():
 # update movie details
 def fetch_movies_detail():
-    limit_count = 10 if debug else 100
+    limit_count = 2 if debug else 100
     movies_list = []
     last_movie_id = 0
-    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
+    abnormal_codes = [craw.http_code_404, craw.http_code_redirect]
     def get_movies(**kwargs):
         if scan_mode == 1:
@@ -347,7 +351,7 @@ def fetch_movies_detail():
         else:
             logging.debug(f"scan all records.")
         kwargs["order_by"] = 'id asc'
-        return db_tools.query_movie_hrefs(limit=limit_count, **kwargs)
+        return db_tools.query_movies(limit=limit_count, **kwargs)
     while True:
         if update_mode == 0: # only traverse new records
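
Note: query_movie_hrefs becomes query_movies; the loop below now reads id, title, and uncensored off each row as well as href, so the helper presumably returns whole rows. A sketch of such a query, with the movies schema assumed (the filters come from trusted scan-mode kwargs, not user input):

import sqlite3

def query_movies(conn: sqlite3.Connection, limit: int = 100,
                 order_by: str = 'id asc', **filters) -> list[dict]:
    """Return up to `limit` movie rows matching the given equality filters."""
    where, params = "1=1", []
    for col, val in filters.items():
        where += f" AND {col} = ?"
        params.append(val)
    conn.row_factory = sqlite3.Row
    rows = conn.execute(
        f"SELECT id, href, title, uncensored FROM movies "
        f"WHERE {where} ORDER BY {order_by} LIMIT ?",
        (*params, limit),
    ).fetchall()
    return [dict(r) for r in rows]
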
@@ -370,10 +374,11 @@ def fetch_movies_detail():
             url = movie['href']
             title = movie['title']
             curr_id = movie['id']
+            uncensored = int(movie['uncensored'])
             logging.debug(f"Fetching data for movie ({title}), url {url} ...")
-            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="container", attr_type="class"))
             # page came from the local cache; skip it
-            if skip_local and status_code == scraper.http_code_local:
+            if skip_local and status_code == craw.http_code_local:
                 last_movie_id = curr_id
                 succ_count += 1
                 continue
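
Note: http_code_local also moves into craw; it flags pages answered from the on-disk cache so the skip_local branch can advance past them without a network hit. A cache-aware fetch_page could hypothetically look like this (cache layout and sentinel value are assumptions):

import hashlib
import os
import requests
from bs4 import BeautifulSoup

CACHE_DIR = "cache"    # assumed location of previously fetched pages
http_code_local = -1   # assumed sentinel, mirroring the craw sketch above

def fetch_page(url, validator):
    """Return (soup, status_code); soup is None when fetch/validation fails."""
    path = os.path.join(CACHE_DIR, hashlib.md5(url.encode()).hexdigest() + ".html")
    if os.path.exists(path):
        with open(path, encoding="utf-8") as f:
            soup = BeautifulSoup(f.read(), "html.parser")
        return (soup if validator(soup) else None), http_code_local
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200:
        return None, resp.status_code
    soup = BeautifulSoup(resp.text, "html.parser")
    return (soup if validator(soup) else None), resp.status_code
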
@@ -381,6 +386,9 @@ def fetch_movies_detail():
             if soup:
                 movie_data = scraper.parse_movie_detail(soup, url, title)
                 if movie_data:
+                    #utils.pretty_print_json(movie_data)
+                    #continue
+                    movie_data['uncensored'] = uncensored
                     movie_id = db_tools.insert_or_update_movie(movie_data)
                     if movie_id:
                         logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
@@ -391,11 +399,11 @@ def fetch_movies_detail():
                 else:
                     logging.warning(f'parse_page_movie error. url: {url}')
-            elif status_code and status_code == scraper.http_code_404:
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
+            elif status_code and status_code == craw.http_code_404:
+                movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404})
                 logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
-            elif status_code and status_code == scraper.http_code_login:
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_login)
+            elif status_code and status_code == craw.http_code_redirect:
+                movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_redirect})
                 logging.warning(f'401 page (need login). id: {movie_id}, title: ({title}), url: {url}, Skipping...')
             else:
                 logging.warning(f'fetch_page error. url: {url}')
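
Note: the movie abnormal paths keep their insert-or-update shape but now take a dict with just href and the is_full_data marker, stamping the abnormal status code onto the row so later scans can skip it. A sqlite3 upsert sketch (schema assumed; requires a UNIQUE constraint on href and SQLite 3.35+ for RETURNING):

import sqlite3

def insert_or_update_movie_404(data: dict, conn: sqlite3.Connection) -> int:
    """Record an abnormal status on the movie row, creating it if absent."""
    row = conn.execute(
        "INSERT INTO movies (href, is_full_data) VALUES (?, ?) "
        "ON CONFLICT(href) DO UPDATE SET is_full_data = excluded.is_full_data "
        "RETURNING id",
        (data['href'], data['is_full_data']),
    ).fetchone()
    conn.commit()
    return row[0]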