modify scripts
@@ -208,17 +208,16 @@ class JavbusDBHandler(DatabaseHandler):
    def update_actor_detail(self, data, is_full_data=1):
        try:
            # Update the actors table
            if data.get('avatar') is not None:
                avatar = data.get('avatar', {})
                avatar['href'] = data['href']
                avatar['is_full_data'] = is_full_data

                avatar_id = self.insert_or_update_common(avatar, self.tbl_name_actors, uniq_key='href')
                logging.debug(f"update actor data. data: {avatar}")
            else:
                avatar_id = self.get_id_by_key(self.tbl_name_actors, 'href', data.get('href', ''))
                if not avatar_id:
                    logging.warning(f"get actor id error. href: {data['href']}")
                    return None
                else:
                    logging.debug(f"update actor data. href: {data['href']}")

            # Update the movies table
            uncensored = data.get('uncensored', 0)
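The hunk above upserts the avatar dict through insert_or_update_common, keyed on href. That helper's body is not part of this diff; a minimal sketch of what such an upsert can look like, assuming a sqlite3 connection and a UNIQUE index on the uniq_key column (everything below is illustrative, not the project's actual code):

```python
import sqlite3

def insert_or_update_common(conn, data, table, uniq_key='href'):
    # Illustrative sketch: upsert `data` (dict of column -> value) into `table`,
    # relying on an assumed UNIQUE index on `uniq_key` to detect duplicates.
    cols = ', '.join(data.keys())
    marks = ', '.join('?' for _ in data)
    updates = ', '.join(f"{c} = excluded.{c}" for c in data if c != uniq_key)
    conn.execute(
        f"INSERT INTO {table} ({cols}) VALUES ({marks}) "
        f"ON CONFLICT({uniq_key}) DO UPDATE SET {updates}",
        tuple(data.values()))
    conn.commit()
    # Return the id of the affected row, mirroring the call sites above.
    row = conn.execute(f"SELECT id FROM {table} WHERE {uniq_key} = ?",
                       (data[uniq_key],)).fetchone()
    return row[0] if row else None
```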
@@ -393,11 +392,12 @@ class JavbusDBHandler(DatabaseHandler):
            logging.debug(f"insert one movie, id: {movie_id}, title: {movie['title']}, href: {movie['href']}")

            # Insert into the performers_movies relation table
+           uncensored = movie.get('uncensored', 0)
            for performer in movie.get('actors', []):
                performer_id = self.get_id_by_key(self.tbl_name_actors, 'href', performer['href'])
                # If the performer does not exist yet, insert it first
                if performer_id is None:
-                   performer_id = self.insert_actor_index(performer['name'], performer['href'], from_movie_list=1)
+                   performer_id = self.insert_actor_index({'zh_name': performer['name'], 'href': performer['href']}, uncensored=uncensored, from_movie_list=1)
                    logging.debug(f"insert new performer. performer_id: {performer_id}, name: {performer['name']}")
                if performer_id:
                    tmp_id = self.insert_actor_movie(performer_id, movie_id)
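insert_actor_movie maintains the performers_movies relation named in the comment above; its implementation is outside this diff. A hedged sketch, assuming sqlite3 and a UNIQUE(actor_id, movie_id) constraint on the relation table:

```python
def insert_actor_movie(conn, actor_id, movie_id):
    # Illustrative sketch: link a performer to a movie; INSERT OR IGNORE makes
    # the call idempotent given the assumed UNIQUE(actor_id, movie_id) index.
    cur = conn.execute(
        "INSERT OR IGNORE INTO performers_movies (actor_id, movie_id) "
        "VALUES (?, ?)", (actor_id, movie_id))
    conn.commit()
    return cur.lastrowid
```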
@@ -1,4 +1,3 @@
-
import json
import time
import csv
@@ -19,7 +18,7 @@ scraper = craw.JavbusCrawler()

debug = False
skip_local = False
-scan_mode = 0
+g_uncensored = 0
update_mode = 0

# Fetch the actor list
@@ -32,7 +31,6 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
    s_url = f"/{lang}/actresses" if lang != 'zh' else f"/actresses"

    current_url = urljoin(scraper.host_url, s_url)
    num = 1
    while current_url:
        logging.info(f"fetching url {current_url}")
        soup, status_code = scraper.fetch_page(current_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
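fetch_page is handed a validator built with functools.partial around scraper.generic_validator. The validator's body is not shown in this diff; presumably it confirms that the expected container element (here <div id="waterfall">) exists before the page is accepted. A sketch of that idea with BeautifulSoup (hypothetical body, real function name from the diff):

```python
from bs4 import BeautifulSoup

def generic_validator(soup: BeautifulSoup, tag="div", identifier="waterfall",
                      attr_type="id"):
    # Illustrative sketch: accept the page only if the expected listing
    # container (e.g. <div id="waterfall">) is present in the parsed HTML.
    return soup.find(tag, {attr_type: identifier}) is not None
```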
@@ -65,69 +63,30 @@ def fetch_actor_list_lang(lang="en", uncensored=None):

# Fetch the actor list
def fetch_actor_list():
-    #for lang in ["en", "ja", "zh"]:
-    for lang in ['en']:
-        fetch_actor_list_lang(lang=lang, uncensored=1)
+    if g_uncensored == 1:
+        for lang in ["en", "ja", "zh"]:
+        #for lang in ['en']:
+            fetch_actor_list_lang(lang=lang, uncensored=1)
-
-    #for lang in ["en", "ja", "zh"]:
-    for lang in ['en']:
-        fetch_actor_list_lang(lang=lang)
+    elif g_uncensored == 0:
+        for lang in ["en", "ja", "zh"]:
+        #for lang in ['en']:
+            fetch_actor_list_lang(lang=lang)

-# Fetch the makers list
-def fetch_makers_list():
-    next_url = scraper.makers_uncensored_base_url
-    while next_url:
-        logging.info(f'fetching page {next_url}')
-        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
-            if list_data:
-                # Write to the database
-                for row in list_data:
-                    maker_id = db_tools.insert_or_update_makers(row, caller='list')
-                    if maker_id:
-                        logging.debug(f"insert maker to db. maker_id: {maker_id}, name: {row['name']}, href: {row['href']}")
-                    else:
-                        logging.warning(f"insert maker failed. name: {row['name']}, href: {row['href']}")
-            else:
-                logging.warning(f'fetch makers error. {next_url} ...')
-
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
-            break
-
-# Fetch the series list
-def fetch_series_list():
-    next_url = scraper.series_uncensored_base_url
-    while next_url:
-        logging.info(f'fetching page {next_url}')
-        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
-            if list_data:
-                # Write to the database
-                for row in list_data:
-                    series_id = db_tools.insert_or_update_series(row, caller='list')
-                    if series_id:
-                        logging.debug(f"insert series to db. series_id: {series_id}, name: {row['name']}, href: {row['href']}")
-                    else:
-                        logging.warning(f"insert series failed. name: {row['name']}, href: {row['href']}")
-            else:
-                logging.warning(f'fetch series error. {next_url} ...')
-
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
-            break

# Update movie info for makers in the list
def fetch_movies_by_maker():
    if debug:
        url_list = db_tools.query_maker_hrefs(name='muramura')
    else:
-        if scan_mode == 1:
+        if g_uncensored == 1:
            url_list = db_tools.query_maker_hrefs(from_list=1)
-        elif scan_mode == 0:
+        elif g_uncensored == 0:
            url_list = db_tools.query_maker_hrefs(from_list=0)
        else:
            url_list = db_tools.query_maker_hrefs()
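The removed fetch_makers_list and fetch_series_list differed only in their start URL, container id, parser, and DB writer. If that logic is reintroduced elsewhere, one way to avoid the duplication is a shared driver parameterized on those four points. The sketch below reuses names that appear in this diff (scraper.fetch_page, generic_validator, db_tools), but the helper itself is hypothetical:

```python
def fetch_paginated_list(start_url, container_id, parse_fn, save_fn, label):
    # Illustrative sketch: one pagination loop shared by makers/series lists.
    next_url = start_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(
            next_url, partial(scraper.generic_validator, tag="div",
                              identifier=container_id, attr_type="id"))
        if soup:
            list_data, next_url = parse_fn(soup, next_url)
            if not list_data:
                logging.warning(f'fetch {label} error. {next_url} ...')
                continue
            for row in list_data:
                if not save_fn(row, caller='list'):
                    logging.warning(f"insert {label} failed. name: {row['name']}")
        else:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break  # give up rather than retry the same URL forever

# Usage, mirroring the removed functions:
# fetch_paginated_list(scraper.makers_uncensored_base_url, "makers",
#                      scraper.parse_makers_uncensored,
#                      db_tools.insert_or_update_makers, "maker")
```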
@@ -166,9 +125,9 @@ def fetch_movies_by_series():
    if debug:
        url_list = db_tools.query_series_hrefs(name='10musume')
    else:
-        if scan_mode == 1:
+        if g_uncensored == 1:
            url_list = db_tools.query_series_hrefs(from_list=1)
-        elif scan_mode == 0:
+        elif g_uncensored == 0:
            url_list = db_tools.query_series_hrefs(from_list=0)
        else:
            url_list = db_tools.query_series_hrefs()
@@ -206,9 +165,9 @@ def fetch_movies_by_publishers():
    if debug:
        url_list = db_tools.query_publishers_hrefs(limit=1)
    else:
-        if scan_mode == 1:
+        if g_uncensored == 1:
            url_list = db_tools.query_publishers_hrefs(from_list=1)
-        elif scan_mode == 0:
+        elif g_uncensored == 0:
            url_list = db_tools.query_publishers_hrefs(from_list=0)
        else:
            url_list = db_tools.query_publishers_hrefs()
@@ -249,9 +208,9 @@ def fetch_performers_detail():
    abnormal_codes = [craw.http_code_404, craw.http_code_redirect]

    def get_performers(**kwargs):
-        if scan_mode == 1:
+        if g_uncensored == 1:
            kwargs["uncensored"] = 1
-        elif scan_mode == 0:
+        elif g_uncensored == 0:
            kwargs["uncensored"] = 0
        else:
            logging.debug("scan all records")
@@ -344,9 +303,9 @@ def fetch_movies_detail():
    abnormal_codes = [craw.http_code_404, craw.http_code_redirect]

    def get_movies(**kwargs):
-        if scan_mode == 1:
+        if g_uncensored == 1:
            kwargs["uncensored"] = 1
-        elif scan_mode == 0:
+        elif g_uncensored == 0:
            kwargs["uncensored"] = 0
        else:
            logging.debug("scan all records.")
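get_movies and get_performers just inject an uncensored filter into **kwargs before delegating to the query layer, which is not shown in this diff. A sketch of how such keyword filters can translate into SQL on the DB side, assuming sqlite3 and a movies table with matching column names (the helper name is hypothetical):

```python
def query_movies(conn, **kwargs):
    # Illustrative sketch: each kwarg (e.g. uncensored=1) becomes an equality
    # predicate; with no kwargs the full table is scanned.
    sql = "SELECT id, href FROM movies"
    params = list(kwargs.values())
    if kwargs:
        sql += " WHERE " + " AND ".join(f"{col} = ?" for col in kwargs)
    return conn.execute(sql, params).fetchall()
```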
@@ -417,8 +376,6 @@ def fetch_movies_detail():
# Map shortcut names to functions
function_map = {
    "actor_list": fetch_actor_list,
-    "maker_list": fetch_makers_list,
-    "series_list": fetch_series_list,
    "makers": fetch_movies_by_maker,
    "series": fetch_movies_by_series,
    "pub": fetch_movies_by_publishers,
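--cmd accepts a comma-separated list of the shortcuts keyed above; the dispatch loop sits outside this hunk. A minimal sketch of how such a map is typically driven:

```python
def run_commands(cmd_arg):
    # Illustrative sketch: resolve each shortcut via function_map and call it.
    for key in (k.strip() for k in cmd_arg.split(',')):
        func = function_map.get(key)
        if func is None:
            logging.warning(f"unknown command shortcut: {key}")
            continue
        func()
```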
@@ -471,8 +428,8 @@ def set_env(args):
    global skip_local
    skip_local = args.skip_local

-    global scan_mode
-    scan_mode = args.scan_mode
+    global g_uncensored
+    g_uncensored = args.uncensored

    global update_mode
    if args.update:
@@ -485,13 +442,13 @@ if __name__ == "__main__":
    usage_examples = textwrap.dedent('''
        Example usage:
        python3 ./fetch.py                            # scan all newly added records
-        python3 ./fetch.py --scan_mode=1              # scan newly added uncensored records
-        python3 ./fetch.py --scan_mode=0              # scan newly added censored (non-uncensored) records
-        python3 ./fetch.py --scan_mode=2              # scan all newly added records (no filter)
+        python3 ./fetch.py --uncensored=1             # scan newly added uncensored records
+        python3 ./fetch.py --uncensored=0             # scan newly added censored (non-uncensored) records
+        python3 ./fetch.py --uncensored=2             # scan all newly added records (no filter)
        python3 ./fetch.py --update=4                 # full scan over all records
-        python3 ./fetch.py --update=4 --scan_mode=1   # full scan over uncensored records
-        python3 ./fetch.py --update=4 --scan_mode=0   # full scan over censored (non-uncensored) records
-        python3 ./fetch.py --update=4 --scan_mode=2   # full scan over all records
+        python3 ./fetch.py --update=4 --uncensored=1  # full scan over uncensored records
+        python3 ./fetch.py --update=4 --uncensored=0  # full scan over censored (non-uncensored) records
+        python3 ./fetch.py --update=4 --uncensored=2  # full scan over all records
    ''')

    parser = argparse.ArgumentParser(
@@ -501,7 +458,7 @@ if __name__ == "__main__":
    #parser = argparse.ArgumentParser(description='fetch javdb data.')
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0 - only rows with is_full_data=0 (default), 1 - only is_full_data=1, 2 - is_full_data<=1, 3 - only is_full_data>1 (abnormal rows), 4 - all rows')
-    parser.add_argument('--scan_mode', type=int, choices=[0, 1, 2], default=1, help='1 - only uncensored makers/series/actors/movies (default), 0 - the opposite, 2 - everything')
+    parser.add_argument('--uncensored', type=int, choices=[0, 1, 2], default=1, help='1 - only uncensored makers/series/actors/movies (default), 0 - the opposite, 2 - everything')
    parser.add_argument('--skip_local', action='store_true', help='skip database operations when the page is already cached locally')
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    args = parser.parse_args()