modify scripts
@@ -13,11 +13,9 @@ import utils
config.setup_logging()

debug = False
force = False
skip_local = False
from_actor = False
abnormal_only = False
fast_mode = False
update_mode = 0

# Fetch the actor list
def fetch_actor_list():
@@ -90,16 +88,18 @@ def fetch_series_list():

# Update movie info from the makers list
def fetch_movies_by_maker():
    if fast_mode:
        url_list = db_tools.query_maker_hrefs(from_list=1)
    else:
        url_list = db_tools.query_maker_hrefs()

    if debug:
        url_list = db_tools.query_maker_hrefs(name='muramura')
    else:
        if fast_mode:
            url_list = db_tools.query_maker_hrefs(from_list=1)
        else:
            url_list = db_tools.query_maker_hrefs()

    for row in url_list:
        url = row['href']
        row_id = row['id']
        uncensored = row['from_list'] if row['from_list'] > 0 else None
        # Strip the downloadable flag (if any)
        next_url = utils.remove_url_query(url)
        while next_url:
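In the new version of this function, debug mode pins the crawl to a single maker (name='muramura'), fast_mode restricts it to rows flagged from_list=1, and otherwise every maker href is queried. The same selection logic repeats for series and publishers below; a minimal sketch of the pattern, where query_hrefs stands in for the db_tools query functions and only the parameter names are taken from the hunk (everything else is illustrative):

# Illustrative sketch only; query_hrefs stands in for db_tools.query_maker_hrefs,
# query_series_hrefs or query_publishers_hrefs from the script above.
def pick_urls(query_hrefs, debug=False, fast_mode=False, debug_name='muramura'):
    if debug:
        return query_hrefs(name=debug_name)   # pin the crawl to one entry while testing
    if fast_mode:
        return query_hrefs(from_list=1)       # only entries flagged as uncensored sources
    return query_hrefs()                      # the full list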
@@ -109,7 +109,7 @@ def fetch_movies_by_maker():
            list_data, next_url = scraper.parse_maker_detail(soup, next_url)
            if list_data:
                for movie in list_data:
                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1, maker_id=row_id)
                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1, maker_id=row_id, uncensored=uncensored)
                    if tmp_id:
                        logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                    else:
@@ -127,16 +127,18 @@ def fetch_movies_by_maker():

# Update movie info from the series list
def fetch_movies_by_series():
    if fast_mode:
        url_list = db_tools.query_series_hrefs(from_list=1)
    else:
        url_list = db_tools.query_series_hrefs()

    if debug:
        url_list = db_tools.query_series_hrefs(name='10musume')
    else:
        if fast_mode:
            url_list = db_tools.query_series_hrefs(from_list=1)
        else:
            url_list = db_tools.query_series_hrefs()

    for row in url_list:
        url = row['href']
        row_id = row['id']
        uncensored = row['from_list'] if row['from_list'] > 0 else None
        # Strip the downloadable flag (if any)
        next_url = utils.remove_url_query(url)
        while next_url:
@@ -146,7 +148,7 @@ def fetch_movies_by_series():
            list_data, next_url = scraper.parse_series_detail(soup, next_url)
            if list_data:
                for movie in list_data:
                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1, series_id=row_id)
                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1, series_id=row_id, uncensored=uncensored)
                    if tmp_id:
                        logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                    else:
@@ -163,13 +165,14 @@ def fetch_movies_by_series():

# Update movie info from the series list
def fetch_movies_by_publishers():
    if fast_mode:
        url_list = db_tools.query_publishers_hrefs(from_list=1)
    else:
        url_list = db_tools.query_publishers_hrefs()

    if debug:
        url_list = db_tools.query_publishers_hrefs(limit=1)
    else:
        if fast_mode:
            url_list = db_tools.query_publishers_hrefs(from_list=1)
        else:
            url_list = db_tools.query_publishers_hrefs()

    for row in url_list:
        url = row['href']
        row_id = row['id']
@@ -201,30 +204,34 @@ def fetch_movies_by_publishers():

# Update performer info
def fetch_performers_detail():
    limit_count = 5 if debug else 100
    perfomers_list = []
    last_perfomer_id = 0
    performers_list = []
    last_performer_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]

    def get_performers(**kwargs):
        if fast_mode:
            kwargs["from_actor_list"] = 1
        kwargs["order_by"] = 'id asc'
        return db_tools.query_actors(limit=limit_count, **kwargs)

    while True:
        # Fetch a batch from the database each time to avoid loading everything at once
        if force:  # iterate everything from the start
            if from_actor:
                if abnormal_only:
                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
                else:
                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
            else:
                if abnormal_only:
                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
                else:
                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
        else:  # update only
            perfomers_list = db_tools.query_actors(is_full_data=0, limit=limit_count)
        if len(perfomers_list) < 1:
        if update_mode == 0:  # only new records
            performers_list = get_performers(start_id=0, is_full_data=0)
        elif update_mode == 1:  # only completed records
            performers_list = get_performers(start_id=last_performer_id, is_full_data=1)
        elif update_mode == 2:  # 0 + 1
            performers_list = get_performers(start_id=last_performer_id, is_full_data_not_in=abnormal_codes)
        elif update_mode == 3:  # abnormal records
            performers_list = get_performers(start_id=last_performer_id, is_full_data_in=abnormal_codes)
        else:  # everything
            performers_list = get_performers(start_id=last_performer_id)

        if len(performers_list) < 1:
            logging.info(f'all performers fetched.')
            break

        succ_rows = 0
        for performer in perfomers_list:
        for performer in performers_list:
            url = performer['href']
            person = performer['name']
            pic = ''
@@ -249,7 +256,7 @@ def fetch_performers_detail():
                need_insert = False
                break
            elif status_code and status_code == scraper.http_code_login:
                actor_id = db_tools.insert_or_update_movie_404(name=person, href=url, is_full_data=scraper.http_code_login)
                actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_login)
                logging.warning(f'401 page (need login). id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                need_insert = False
                break
@@ -270,13 +277,13 @@ def fetch_performers_detail():
            })
            if performer_id:
                logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
                last_perfomer_id = performer_id
                last_performer_id = performer_id
                succ_rows += 1
            else:
                logging.warning(f'insert person: ({person}) {url} failed.')
            time.sleep(0.5)

        logging.info(f'total request: {len(perfomers_list)}, succ: {succ_rows}, last performer id: {last_perfomer_id}')
        logging.info(f'total request: {len(performers_list)}, succ: {succ_rows}, last performer id: {last_performer_id}')
        # Break when debugging
        if debug:
            return True
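The loop above processes performers in batches of limit_count, advancing last_performer_id only when a row is stored successfully and logging a per-batch summary; fetch_movies_detail below follows the same shape. A stripped-down sketch of that batching pattern, with hypothetical fetch_batch and process callables standing in for the database query and the scraping/insert work:

import logging

# Keyset-style batching as used above: ask for `limit` rows past the last
# processed id, stop when a batch comes back empty.
def crawl_in_batches(fetch_batch, process, limit=100):
    last_id = 0
    while True:
        rows = fetch_batch(start_id=last_id, limit=limit)
        if len(rows) < 1:
            break                      # everything fetched
        succ = 0
        for row in rows:
            row_id = process(row)      # returns the stored id on success, falsy on failure
            if row_id:
                last_id = row_id       # advance the cursor only past successful rows
                succ += 1
        logging.info(f'total request: {len(rows)}, succ: {succ}, last id: {last_id}')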
@@ -287,23 +294,29 @@ def fetch_movies_detail():
    movies_list = []
    last_movie_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
    while True:
        if force:  # iterate everything from the start
            if from_actor:
                if abnormal_only:
                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
                else:
                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
            else:
                if abnormal_only:
                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
                else:
                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
        else:  # update only
            movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)

    def get_movies(**kwargs):
        if fast_mode:
            kwargs["uncensored"] = 1
        kwargs["order_by"] = 'id asc'
        return db_tools.query_movie_hrefs(limit=limit_count, **kwargs)

    while True:
        if update_mode == 0:  # only new records
            movies_list = get_movies(start_id=0, is_full_data=0)
        elif update_mode == 1:  # only completed records
            movies_list = get_movies(start_id=last_movie_id, is_full_data=1)
        elif update_mode == 2:  # 0 + 1
            movies_list = get_movies(start_id=last_movie_id, is_full_data_not_in=abnormal_codes)
        elif update_mode == 3:  # abnormal records
            movies_list = get_movies(start_id=last_movie_id, is_full_data_in=abnormal_codes)
        else:  # everything
            movies_list = get_movies(start_id=last_movie_id)

        if len(movies_list) < 1:
            logging.info(f'all movies fetched.')
            logging.info(f'all performers fetched.')
            break

        succ_count = 0
        for movie in movies_list:
            url = movie['href']
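Both fetch_performers_detail above and fetch_movies_detail here gate their queries on the same five-way update_mode switch (0 new, 1 completed, 2 both, 3 abnormal, anything else everything). A compact sketch of that shared dispatch, with query standing in for get_performers or get_movies; only the keyword filter names are taken from the hunks, the helper itself is illustrative:

# Simplified illustration of the shared update_mode dispatch; `query` stands in
# for get_performers / get_movies, abnormal_codes is the 404/login list above.
def select_batch(query, update_mode, last_id, abnormal_codes):
    if update_mode == 0:      # new records only (is_full_data == 0)
        return query(start_id=0, is_full_data=0)
    if update_mode == 1:      # completed records only
        return query(start_id=last_id, is_full_data=1)
    if update_mode == 2:      # new + completed
        return query(start_id=last_id, is_full_data_not_in=abnormal_codes)
    if update_mode == 3:      # abnormal records (404 / login required)
        return query(start_id=last_id, is_full_data_in=abnormal_codes)
    return query(start_id=last_id)            # mode 4 or anything else: everything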
@@ -399,33 +412,26 @@ def set_env(args):
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    global force
    force = args.force

    global skip_local
    skip_local = args.skip_local

    global from_actor
    from_actor = args.from_actor

    global abnormal_only
    abnormal_only = args.abnormal_only

    global fast_mode
    fast_mode = args.fast_mode

    global update_mode
    if args.update:
        update_mode = args.update

if __name__ == "__main__":
    # Handle command-line arguments
    keys_str = ",".join(function_map.keys())

    parser = argparse.ArgumentParser(description='fetch javdb data.')
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0 - only rows with is_full_data=0, 1 - only rows with is_full_data=1, 2 - rows with is_full_data<=1, 3 - only rows with is_full_data>1 (abnormal data), 4 - all rows')
    parser.add_argument('--fast_mode', action='store_true', help='only iterate over uncensored makers/series/actors/movies')
    parser.add_argument('--skip_local', action='store_true', help='skip database operations when the page is already cached locally')
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
    parser.add_argument('--skip_local', action='store_true', help='skip if cached html (true for skip)')
    parser.add_argument('--from_actor', action='store_true', help='only iterate over performers or movies coming from actor_list (effective in force mode)')
    parser.add_argument('--abnormal_only', action='store_true', help='only iterate over performers or movies with abnormal URLs (404, login required, etc.; effective in force mode)')
    parser.add_argument('--fast_mode', action='store_true', help='only iterate over uncensored makers and series')
    args = parser.parse_args()

    set_env(args)
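For reference, the reworked flag set can be exercised like this; a minimal, self-contained sketch that only mirrors the arguments added above (--update, --fast_mode, --debug), with the --cmd shortcuts and the rest of the script omitted:

import argparse

# Mirror of the new arguments from the diff, for trying out combinations in isolation.
parser = argparse.ArgumentParser(description='fetch javdb data.')
parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0)
parser.add_argument('--fast_mode', action='store_true')
parser.add_argument('--debug', action='store_true')

# e.g. retry only abnormal records, restricted to uncensored sources:
args = parser.parse_args(['--update', '3', '--fast_mode'])
print(args.update, args.fast_mode, args.debug)   # -> 3 True False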