modify scripts
This commit is contained in:
@ -61,7 +61,7 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
|
||||
if debug:
|
||||
return True
|
||||
|
||||
# 获取演员列表
|
||||
# 获取演员列表,控制逻辑,多语言
|
||||
def fetch_actor_list():
|
||||
if g_uncensored == 1:
|
||||
for lang in ["en", "ja", "zh"]:
|
||||
@ -95,10 +95,13 @@ def fetch_movies_common(tbl):
|
||||
url = row['href']
|
||||
row_id = row['id']
|
||||
uncensored = row['uncensored'] if row['uncensored'] > 0 else None
|
||||
if not utils.is_valid_url(url):
|
||||
logging.info(f'invalid url {url} in {tbl}, skipping...')
|
||||
continue
|
||||
# 去掉可下载的标志(如果有)
|
||||
next_url = url
|
||||
while next_url:
|
||||
logging.info(f"Fetching data for maker url {next_url} ...")
|
||||
logging.info(f"Fetching data from {tbl} url {next_url} ...")
|
||||
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
|
||||
@ -125,6 +128,10 @@ def fetch_movies_common(tbl):
|
||||
elif status_code and status_code == 404:
|
||||
logging.warning(f"fetch page error. httpcode: {status_code}, url: {next_url}")
|
||||
break
|
||||
else: # 达到失败上限,加上休眠继续重试
|
||||
time.sleep(5)
|
||||
|
||||
time.sleep(0.3)
|
||||
|
||||
# 调试增加break
|
||||
if debug:
|
||||
@ -142,7 +149,7 @@ def fetch_movies_by_label():
|
||||
def fetch_movies_by_series():
|
||||
fetch_movies_common('series')
|
||||
|
||||
# 从studio/label/series中获取影片
|
||||
# 从studio/label/series首页获取他们的多语言表述
|
||||
def update_multilang_common(tbl):
|
||||
if debug:
|
||||
url_list = db_tools.query_list_common(tbl=tbl, limit=3)
|
||||
@ -162,7 +169,7 @@ def update_multilang_common(tbl):
|
||||
langs_url = utils.generate_multilang_urls(url)
|
||||
for lang, next_url in langs_url.items():
|
||||
while next_url:
|
||||
logging.info(f"Fetching data for url {next_url} ..., raw url: {url}")
|
||||
logging.info(f"Fetching data for url {next_url} ...")
|
||||
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
|
||||
@ -186,13 +193,21 @@ def update_multilang_common(tbl):
|
||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
|
||||
break
|
||||
|
||||
else: # 达到失败上限,加上休眠继续重试
|
||||
time.sleep(5)
|
||||
|
||||
time.sleep(0.3)
|
||||
|
||||
if debug:
|
||||
break
|
||||
|
||||
# 更新studio/label/series的多语言表述
|
||||
def update_multi_langs():
|
||||
update_multilang_common('studio')
|
||||
update_multilang_common('label')
|
||||
update_multilang_common('series')
|
||||
|
||||
# 从studio/label/series中获取影片
|
||||
# 获取影片tags的多语言表述
|
||||
def update_multilang_tags():
|
||||
if debug:
|
||||
url_list = db_tools.query_tags(limit=5)
|
||||
@ -231,6 +246,13 @@ def update_multilang_tags():
|
||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
|
||||
break
|
||||
|
||||
else: # 达到失败上限,加上休眠继续重试
|
||||
time.sleep(5)
|
||||
|
||||
time.sleep(0.3)
|
||||
if debug:
|
||||
break
|
||||
|
||||
# 更新演员信息
|
||||
def fetch_performers_detail():
|
||||
limit_count = 5 if debug else 100
|
||||
@ -296,6 +318,7 @@ def fetch_performers_detail():
|
||||
break
|
||||
else:
|
||||
logging.warning(f'fetch_page error. url: {url}')
|
||||
time.sleep(2)
|
||||
|
||||
# 如果出现了401或者404,已经处理,直接跳过
|
||||
if not need_insert:
|
||||
@ -388,6 +411,7 @@ def fetch_movies_detail():
|
||||
logging.warning(f'insert movie {url} failed.')
|
||||
else:
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
time.sleep(2)
|
||||
|
||||
elif status_code and status_code == craw.http_code_404:
|
||||
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404})
|
||||
|
||||
Reference in New Issue
Block a user