modify scripts
This commit is contained in:
@ -61,7 +61,7 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
|
|||||||
if debug:
|
if debug:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# 获取演员列表
|
# 获取演员列表,控制逻辑,多语言
|
||||||
def fetch_actor_list():
|
def fetch_actor_list():
|
||||||
if g_uncensored == 1:
|
if g_uncensored == 1:
|
||||||
for lang in ["en", "ja", "zh"]:
|
for lang in ["en", "ja", "zh"]:
|
||||||
@ -95,10 +95,13 @@ def fetch_movies_common(tbl):
|
|||||||
url = row['href']
|
url = row['href']
|
||||||
row_id = row['id']
|
row_id = row['id']
|
||||||
uncensored = row['uncensored'] if row['uncensored'] > 0 else None
|
uncensored = row['uncensored'] if row['uncensored'] > 0 else None
|
||||||
|
if not utils.is_valid_url(url):
|
||||||
|
logging.info(f'invalid url {url} in {tbl}, skipping...')
|
||||||
|
continue
|
||||||
# 去掉可下载的标志(如果有)
|
# 去掉可下载的标志(如果有)
|
||||||
next_url = url
|
next_url = url
|
||||||
while next_url:
|
while next_url:
|
||||||
logging.info(f"Fetching data for maker url {next_url} ...")
|
logging.info(f"Fetching data from {tbl} url {next_url} ...")
|
||||||
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
|
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
|
||||||
if soup:
|
if soup:
|
||||||
list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
|
list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
|
||||||
@ -125,6 +128,10 @@ def fetch_movies_common(tbl):
|
|||||||
elif status_code and status_code == 404:
|
elif status_code and status_code == 404:
|
||||||
logging.warning(f"fetch page error. httpcode: {status_code}, url: {next_url}")
|
logging.warning(f"fetch page error. httpcode: {status_code}, url: {next_url}")
|
||||||
break
|
break
|
||||||
|
else: # 达到失败上限,加上休眠继续重试
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
time.sleep(0.3)
|
||||||
|
|
||||||
# 调试增加brak
|
# 调试增加brak
|
||||||
if debug:
|
if debug:
|
||||||
@ -142,7 +149,7 @@ def fetch_movies_by_label():
|
|||||||
def fetch_movies_by_series():
|
def fetch_movies_by_series():
|
||||||
fetch_movies_common('series')
|
fetch_movies_common('series')
|
||||||
|
|
||||||
# 从studio/label/series中获取影片
|
# 从studio/label/series首页获取他们的多语言表述
|
||||||
def update_multilang_common(tbl):
|
def update_multilang_common(tbl):
|
||||||
if debug:
|
if debug:
|
||||||
url_list = db_tools.query_list_common(tbl=tbl, limit=3)
|
url_list = db_tools.query_list_common(tbl=tbl, limit=3)
|
||||||
@ -162,7 +169,7 @@ def update_multilang_common(tbl):
|
|||||||
langs_url = utils.generate_multilang_urls(url)
|
langs_url = utils.generate_multilang_urls(url)
|
||||||
for lang, next_url in langs_url.items():
|
for lang, next_url in langs_url.items():
|
||||||
while next_url:
|
while next_url:
|
||||||
logging.info(f"Fetching data for url {next_url} ..., raw url: {url}")
|
logging.info(f"Fetching data for url {next_url} ...")
|
||||||
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
|
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
|
||||||
if soup:
|
if soup:
|
||||||
list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
|
list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
|
||||||
@ -186,13 +193,21 @@ def update_multilang_common(tbl):
|
|||||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
|
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
|
||||||
break
|
break
|
||||||
|
|
||||||
|
else: # 达到失败上限,加上休眠继续重试
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
time.sleep(0.3)
|
||||||
|
|
||||||
|
if debug:
|
||||||
|
break
|
||||||
|
|
||||||
# 更新series列表中的影片信息
|
# 更新series列表中的影片信息
|
||||||
def update_multi_langs():
|
def update_multi_langs():
|
||||||
update_multilang_common('studio')
|
update_multilang_common('studio')
|
||||||
update_multilang_common('label')
|
update_multilang_common('label')
|
||||||
update_multilang_common('series')
|
update_multilang_common('series')
|
||||||
|
|
||||||
# 从studio/label/series中获取影片
|
# 获取影片tags的多语言表述
|
||||||
def update_multilang_tags():
|
def update_multilang_tags():
|
||||||
if debug:
|
if debug:
|
||||||
url_list = db_tools.query_tags(limit=5)
|
url_list = db_tools.query_tags(limit=5)
|
||||||
@ -231,6 +246,13 @@ def update_multilang_tags():
|
|||||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
|
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
|
||||||
break
|
break
|
||||||
|
|
||||||
|
else: # 达到失败上限,加上休眠继续重试
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
time.sleep(0.3)
|
||||||
|
if debug:
|
||||||
|
break
|
||||||
|
|
||||||
# 更新演员信息
|
# 更新演员信息
|
||||||
def fetch_performers_detail():
|
def fetch_performers_detail():
|
||||||
limit_count = 5 if debug else 100
|
limit_count = 5 if debug else 100
|
||||||
@ -296,6 +318,7 @@ def fetch_performers_detail():
|
|||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
logging.warning(f'fetch_page error. url: {url}')
|
logging.warning(f'fetch_page error. url: {url}')
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
# 如果出现了401或者404,已经处理,直接跳过
|
# 如果出现了401或者404,已经处理,直接跳过
|
||||||
if not need_insert:
|
if not need_insert:
|
||||||
@ -388,6 +411,7 @@ def fetch_movies_detail():
|
|||||||
logging.warning(f'insert movie {url} failed.')
|
logging.warning(f'insert movie {url} failed.')
|
||||||
else:
|
else:
|
||||||
logging.warning(f'parse_page_movie error. url: {url}')
|
logging.warning(f'parse_page_movie error. url: {url}')
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
elif status_code and status_code == craw.http_code_404:
|
elif status_code and status_code == craw.http_code_404:
|
||||||
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404})
|
movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404})
|
||||||
|
|||||||
Reference in New Issue
Block a user