modify scripts
@@ -79,32 +79,42 @@ def fetch_actor_list():
     #for lang in ['en']:
         fetch_actor_list_lang(lang=lang)

-# Update movie info for the makers list
-def fetch_movies_by_maker():
+# Fetch movies from studio/label/series
+def fetch_movies_common(tbl):
     if debug:
-        url_list = db_tools.query_maker_hrefs(name='muramura')
+        url_list = db_tools.query_list_common(tbl=tbl)
     else:
         if g_uncensored==1:
-            url_list = db_tools.query_maker_hrefs(from_list=1)
+            url_list = db_tools.query_list_common(tbl=tbl, uncensored=1)
         elif g_uncensored==0:
-            url_list = db_tools.query_maker_hrefs(from_list=0)
+            url_list = db_tools.query_list_common(tbl=tbl, uncensored=0)
         else:
-            url_list = db_tools.query_maker_hrefs()
+            url_list = db_tools.query_list_common(tbl=tbl)

     for row in url_list:
         url = row['href']
         row_id = row['id']
-        uncensored = row['from_list'] if row['from_list'] > 0 else None
+        uncensored = row['uncensored'] if row['uncensored'] > 0 else None
-        # Strip the download flag from the URL (if present)
-        next_url = utils.remove_url_query(url)
+        next_url = url
         while next_url:
             logging.info(f"Fetching data for maker url {next_url} ...")
-            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
+            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
             if soup:
-                list_data, next_url = scraper.parse_maker_detail(soup, next_url)
-                if list_data:
-                    for movie in list_data:
-                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1, maker_id=row_id, uncensored=uncensored)
+                list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
+                if list_data:
+                    # Build the extra kwargs dynamically from the value of tbl
+                    extra_kwargs = {}
+                    if tbl == 'studio':
+                        extra_kwargs = {'from_movie_studios': 1, 'studio_id': row_id}
+                    elif tbl == 'label':
+                        extra_kwargs = {'from_movie_labels': 1, 'label_id': row_id}
+                    elif tbl == 'series':
+                        extra_kwargs = {'from_movie_series': 1, 'series_id': row_id}
+                    extra_kwargs['uncensored'] = uncensored
+
+                    for movie in list_data.get('movies', []):
+                        tmp_id = db_tools.insert_movie_index({'title': movie['title'], 'href': movie['href']}, **extra_kwargs)
                         if tmp_id:
                             logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                         else:
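Both versions of the fetch bind a page validator with functools.partial; the commit swaps the marker element from the "column section-title" class to the "waterfall" id. A minimal sketch of how such a partial-bound validator can work, assuming a BeautifulSoup-style soup object (the validator body here is hypothetical; only its call shape comes from the diff):

from functools import partial

# Hypothetical body for a generic_validator: report whether the parsed
# page contains the element that marks a fully rendered listing.
def generic_validator(soup, tag, identifier, attr_type):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    return soup.find(tag, attrs={attr_type: identifier}) is not None

# Bind the page-specific arguments once; the fetch layer then only
# needs to call validator(soup).
waterfall_ok = partial(generic_validator, tag="div", identifier="waterfall", attr_type="id")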
@@ -120,85 +130,106 @@ def fetch_movies_by_maker():
     if debug:
         return True

+# Update movie info for the studios list
+def fetch_movies_by_studio():
+    fetch_movies_common('studio')
+
+# Update movie info for the labels list
+def fetch_movies_by_label():
+    fetch_movies_common('label')
+
 # Update movie info for the series list
 def fetch_movies_by_series():
+    fetch_movies_common('series')
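All three wrappers funnel into fetch_movies_common, which assembles the insert_movie_index kwargs from tbl via the if/elif chain in the first hunk. The same mapping can be written table-driven; a sketch of that alternative (a hypothetical refactor using the exact field names from the diff, not something this commit does):

def build_extra_kwargs(tbl, row_id, uncensored):
    # Field names copied from the diff's if/elif chain.
    table_fields = {
        'studio': {'from_movie_studios': 1, 'studio_id': row_id},
        'label': {'from_movie_labels': 1, 'label_id': row_id},
        'series': {'from_movie_series': 1, 'series_id': row_id},
    }
    extra_kwargs = dict(table_fields.get(tbl, {}))
    extra_kwargs['uncensored'] = uncensored
    return extra_kwargs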

+# Update multilingual names for studio/label/series
+def update_multilang_common(tbl):
     if debug:
-        url_list = db_tools.query_series_hrefs(name='10musume')
+        url_list = db_tools.query_list_common(tbl=tbl, limit=3)
     else:
-        if g_uncensored == 1:
-            url_list = db_tools.query_series_hrefs(from_list=1)
-        elif g_uncensored == 0:
-            url_list = db_tools.query_series_hrefs(from_list=0)
+        if g_uncensored==1:
+            url_list = db_tools.query_list_common(tbl=tbl, uncensored=1)
+        elif g_uncensored==0:
+            url_list = db_tools.query_list_common(tbl=tbl, uncensored=0)
         else:
-            url_list = db_tools.query_series_hrefs()
+            url_list = db_tools.query_list_common(tbl=tbl)

     for row in url_list:
         url = row['href']
         row_id = row['id']
-        uncensored = row['from_list'] if row['from_list'] > 0 else None
-        # Strip the download flag from the URL (if present)
-        next_url = utils.remove_url_query(url)
-        while next_url:
-            logging.info(f"Fetching data for series url {next_url} ...")
-            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
-            if soup:
-                list_data, next_url = scraper.parse_series_detail(soup, next_url)
-                if list_data:
-                    for movie in list_data:
-                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1, series_id=row_id, uncensored=uncensored)
-                        if tmp_id:
-                            logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
-                        else:
-                            logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
-                else:
-                    logging.warning(f'parse_page_movie error. url: {next_url}')
-            elif status_code and status_code == 404:
-                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
-                break
+        if not utils.is_valid_url(url):
+            logging.info(f'invalid url {url} in {tbl}, skipping...')
+            continue
+        langs_url = utils.generate_multilang_urls(url)
+        for lang, next_url in langs_url.items():
+            while next_url:
+                logging.info(f"Fetching data for url {next_url} ..., raw url: {url}")
+                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
+                if soup:
+                    list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
+                    if list_data:
+                        lang_meta = list_data.get('meta', {})
+                        if lang_meta.get('title') is not None:
+                            lang_meta['href'] = url
+                            lang_meta[f'{lang}_name'] = lang_meta.get('title')
+                            tmp_id = db_tools.update_pubs_multilang(lang_meta, tbl)
+                            if tmp_id:
+                                logging.debug(f'update pubs multi lang. data: {lang_meta}')
+                            else:
+                                logging.warning(f'update pubs multi lang failed. data: {lang_meta}')
+                    else:
+                        logging.warning(f'parse_page_movie error. url: {next_url}')
+
+                    # Break early when debugging
+                    if debug:
+                        return True
+                    # No pagination needed; the first page is enough
+                    break
+
+                elif status_code and status_code == 404:
+                    logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
+                    break
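The loop above expects utils.generate_multilang_urls to return a mapping of language code to localized listing URL, and it deliberately crawls only the first page of each. The helper is project-internal; a hypothetical sketch of the shape the caller relies on (the language codes and path scheme are assumptions, not taken from the diff):

from urllib.parse import urlsplit, urlunsplit

def generate_multilang_urls(url, langs=('en', 'ja', 'zh')):
    # Assumed convention: the localized page lives under a /<lang> path prefix.
    parts = urlsplit(url)
    return {lang: urlunsplit((parts.scheme, parts.netloc, f'/{lang}{parts.path}',
                              parts.query, ''))
            for lang in langs}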

-# Update movie info for the publishers list
-def fetch_movies_by_publishers():
+def update_multi_langs():
+    update_multilang_common('studio')
+    update_multilang_common('label')
+    update_multilang_common('series')

+# Update multilingual names for tags
+def update_multilang_tags():
     if debug:
-        url_list = db_tools.query_publishers_hrefs(limit=1)
+        url_list = db_tools.query_tags(limit=5)
     else:
-        if g_uncensored == 1:
-            url_list = db_tools.query_publishers_hrefs(from_list=1)
-        elif g_uncensored == 0:
-            url_list = db_tools.query_publishers_hrefs(from_list=0)
-        else:
-            url_list = db_tools.query_publishers_hrefs()
+        url_list = db_tools.query_tags()

     for row in url_list:
         url = row['href']
         row_id = row['id']
-        # Strip the download flag from the URL (if present)
-        next_url = utils.remove_url_query(url)
-        while next_url:
-            logging.info(f"Fetching data for publisher url {next_url} ...")
-            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="modal-card", attr_type="class"))
-            if soup:
-                list_data, next_url = scraper.parse_publisher_detail(soup, next_url)
-                if list_data:
-                    for movie in list_data:
-                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_publishers=1, pub_id=row_id)
-                        if tmp_id:
-                            logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
-                        else:
-                            logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
-                else:
-                    logging.warning(f'parse_page_movie error. url: {next_url}')
-            elif status_code and status_code == 404:
-                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
-                break
+        if not utils.is_valid_url(url):
+            logging.info(f'invalid url {url}, skipping...')
+            continue
+        langs_url = utils.generate_multilang_urls(url)
+        for lang, next_url in langs_url.items():
+            while next_url:
+                logging.info(f"Fetching data for url {next_url} ..., raw url: {url}")
+                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
+                if soup:
+                    list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
+                    if list_data:
+                        lang_meta = list_data.get('meta', {})
+                        if lang_meta.get('title') is not None:
+                            lang_meta['href'] = url
+                            lang_meta[f'{lang}_name'] = lang_meta.get('title')
+                            tmp_id = db_tools.update_tags(lang_meta)
+                            if tmp_id:
+                                logging.debug(f'update tags multi lang. data: {lang_meta}')
+                            else:
+                                logging.warning(f'update tags multi lang failed. data: {lang_meta}')
+                    else:
+                        logging.warning(f'parse_page_movie error. url: {next_url}')

+                    # Break early when debugging
+                    if debug:
+                        return True
+                    # No pagination needed; the first page is enough
+                    break

+                elif status_code and status_code == 404:
+                    logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
+                    break
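Both multilang loops fold each localized title into a single record keyed f'{lang}_name' before handing it to the update call. A worked example of the dict update_tags would receive (values are illustrative):

lang = 'en'
lang_meta = {'title': 'Example Tag'}                # parsed from the localized page
lang_meta['href'] = 'https://example.com/tags/1'    # hypothetical raw url used as the key
lang_meta[f'{lang}_name'] = lang_meta.get('title')
# lang_meta == {'title': 'Example Tag', 'href': 'https://example.com/tags/1',
#               'en_name': 'Example Tag'}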

 # Update performer info
 def fetch_performers_detail():
@@ -376,11 +407,13 @@ def fetch_movies_detail():
 # Map command abbreviations to functions
 function_map = {
     "actor_list": fetch_actor_list,
     "makers": fetch_movies_by_maker,
     "studio" : fetch_movies_by_studio,
     "series" : fetch_movies_by_series,
     "pub" : fetch_movies_by_publishers,
     "labels" : fetch_movies_by_label,
     "actors" : fetch_performers_detail,
     "movies" : fetch_movies_detail,
+    "langs" : update_multi_langs,
+    "tags" : update_multilang_tags,
 }
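main presumably resolves the command through this table. A minimal dispatch sketch, assuming cmd arrives as one of the keys above (the error handling is illustrative, not from the commit):

def dispatch(cmd):
    handler = function_map.get(cmd)
    if handler is None:
        # Fail fast with the valid choices rather than a KeyError.
        raise SystemExit(f"unknown command {cmd!r}; expected one of {sorted(function_map)}")
    return handler()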

 # Main function
@@ -415,7 +448,7 @@ def main(cmd, args):
     db_tools.finalize_task_log(task_id)

 # TODO:
-# 1,
+# 1. multilingual support for tags and studio/label/series

 # Set environment variables
 def set_env(args):