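"""Fetch javdb data (actor, maker, series and movie lists plus detail pages) and
store it through the SQLite helpers in sqlite_utils.

Individual steps are exposed through the --cmd shortcuts defined in function_map
below (actor_list, maker_list, series_list, makers, series, movies, actors);
with no --cmd, every step runs in the order defined in function_map.

Example invocations (the script filename is illustrative):

    python fetch_javdb.py --cmd actor_list,actors --debug
    python fetch_javdb.py --force --abnormal_only
"""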
import json
import time
import csv
import argparse
import logging
from functools import partial

import config
import sqlite_utils as db_tools
import scraper
import utils

config.setup_logging()

# Module-level flags, overridden from the command-line arguments in set_env().
debug = False
force = False
skip_local = False
from_actor = False
abnormal_only = False
fast_mode = False


# Fetch the actor list
def fetch_actor_list():
    """Walk the uncensored actor list pages and store an index entry for each performer."""
    next_url = scraper.actors_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
            if list_data:
                # Write to the database
                for row in list_data:
                    actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
                    if actor_id:
                        logging.debug(f"insert performer index to db. performer_id: {actor_id}, name: {row['name']}, href: {row['href']}")
                    else:
                        logging.warning(f"insert performer index failed. name: {row['name']}, href: {row['href']}")
            else:
                logging.warning(f'parse actor list error. {next_url} ...')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break


# Fetch the makers list
def fetch_makers_list():
    """Walk the uncensored maker list pages and store each maker."""
    next_url = scraper.makers_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
            if list_data:
                # Write to the database
                for row in list_data:
                    maker_id = db_tools.insert_or_update_makers(row, caller='list')
                    if maker_id:
                        logging.debug(f"insert maker to db. maker_id: {maker_id}, name: {row['name']}, href: {row['href']}")
                    else:
                        logging.warning(f"insert maker failed. name: {row['name']}, href: {row['href']}")
            else:
                logging.warning(f'parse maker list error. {next_url} ...')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break


# Fetch the series list
def fetch_series_list():
    """Walk the uncensored series list pages and store each series."""
    next_url = scraper.series_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
            if list_data:
                # Write to the database
                for row in list_data:
                    series_id = db_tools.insert_or_update_series(row, caller='list')
                    if series_id:
                        logging.debug(f"insert series to db. series_id: {series_id}, name: {row['name']}, href: {row['href']}")
                    else:
                        logging.warning(f"insert series failed. name: {row['name']}, href: {row['href']}")
            else:
                logging.warning(f'parse series list error. {next_url} ...')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break


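# The three list fetchers above share one pattern: follow pagination through the
# next_url returned by the parser, validate each page, and insert every row.
# A minimal sketch of a generic helper that could factor this out (an optional
# refactor, not wired into the current flow; validator, parse_func and
# insert_func would wrap the existing scraper/db_tools callables):
def _fetch_paginated_list(start_url, validator, parse_func, insert_func, label):
    next_url = start_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, validator)
        if soup:
            list_data, next_url = parse_func(soup, next_url)
            if not list_data:
                logging.warning(f'parse {label} list error. {next_url} ...')
                continue
            # Write each row to the database
            for row in list_data:
                row_id = insert_func(row)
                if not row_id:
                    logging.warning(f"insert {label} failed. name: {row.get('name')}, href: {row.get('href')}")
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break

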
# Update movie info from the makers list
def fetch_movies_by_maker():
    """For every maker page, collect its movies and store an index entry for each."""
    if fast_mode:
        url_list = db_tools.query_maker_hrefs(from_list=1)
    else:
        url_list = db_tools.query_maker_hrefs()

    if debug:
        url_list = db_tools.query_maker_hrefs(name='muramura')
    for url in url_list:
        # Strip the downloadable flag from the URL query (if present)
        next_url = utils.remove_url_query(url)
        while next_url:
            logging.info(f"Fetching data for maker url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_maker_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1)
                        if tmp_id:
                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                        else:
                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f'parse_maker_detail error. url: {next_url}')
            elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break

        # Break early in debug mode
        if debug:
            return True


# Update movie info from the series list
def fetch_movies_by_series():
    """For every series page, collect its movies and store an index entry for each."""
    if fast_mode:
        url_list = db_tools.query_series_hrefs(from_list=1)
    else:
        url_list = db_tools.query_series_hrefs()

    if debug:
        url_list = db_tools.query_series_hrefs(name='10musume')
    for url in url_list:
        # Strip the downloadable flag from the URL query (if present)
        next_url = utils.remove_url_query(url)
        while next_url:
            logging.info(f"Fetching data for series url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_series_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1)
                        if tmp_id:
                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                        else:
                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f'parse_series_detail error. url: {next_url}')
            elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break

        # Break early in debug mode
        if debug:
            return True


# Update performer details
def fetch_performers_detail():
    """Fetch the detail page for each indexed performer and store the full record."""
    limit_count = 5 if debug else 100
    perfomers_list = []
    last_perfomer_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
    while True:
        # Fetch a batch from the database each time instead of loading everything at once
        if force:  # walk through all records from the beginning
            if from_actor:
                if abnormal_only:
                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
                else:
                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
            else:
                if abnormal_only:
                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
                else:
                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
        else:  # only update records that are not yet complete
            perfomers_list = db_tools.query_actors(is_full_data=0, limit=limit_count)
        if len(perfomers_list) < 1:
            logging.info('all performers fetched.')
            break

        succ_rows = 0
        for performer in perfomers_list:
            url = performer['href']
            person = performer['name']
            pic = ''
            alias = []

            next_url = url
            all_movies = []
            need_insert = True
            while next_url:
                logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="span", identifier="actor-section-name", attr_type="class"))
                if soup:
                    data, next_url = scraper.parse_actor_detail(soup, next_url)
                    if data:
                        pic = data.get('pic', '')
                        alias = data.get('alias', [])
                        all_movies.extend(data.get('movies', []))
                elif status_code and status_code == scraper.http_code_404:
                    actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_404)
                    logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                    need_insert = False
                    break
                elif status_code and status_code == scraper.http_code_login:
                    actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_login)
                    logging.warning(f'401 page (login required). id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                    need_insert = False
                    break
                else:
                    logging.warning(f'fetch_page error. url: {url}')

            # A 401 or 404 was already handled above, so skip this performer
            if not need_insert:
                continue

            # All of this performer's movies have been collected; insert the data
            performer_id = db_tools.insert_or_update_actor({
                'href': url,
                'name': person,
                'pic': pic,
                'alias': alias,
                'credits': all_movies
            })
            if performer_id:
                logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
                last_perfomer_id = performer_id
                succ_rows += 1
            else:
                logging.warning(f'insert person: ({person}) {url} failed.')
            time.sleep(0.5)

        logging.info(f'total request: {len(perfomers_list)}, succ: {succ_rows}, last performer id: {last_perfomer_id}')
        # Break early in debug mode
        if debug:
            return True


# Update movie details
def fetch_movies_detail():
    """Fetch the detail page for each indexed movie and store the full record."""
    limit_count = 10 if debug else 100
    movies_list = []
    last_movie_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
    while True:
        # Fetch a batch from the database each time instead of loading everything at once
        if force:  # walk through all records from the beginning
            if from_actor:
                if abnormal_only:
                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
                else:
                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
            else:
                if abnormal_only:
                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
                else:
                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
        else:  # only update records that are not yet complete
            movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
        if len(movies_list) < 1:
            logging.info('all movies fetched.')
            break
        succ_count = 0
        for movie in movies_list:
            url = movie['href']
            title = movie['title']
            curr_id = movie['id']
            logging.debug(f"Fetching data for movie ({title}), url {url} ...")
            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
            # Page was served from the local cache; skip it
            if skip_local and status_code == scraper.http_code_local:
                last_movie_id = curr_id
                succ_count += 1
                continue
            # Parse the page and write to the database
            if soup:
                movie_data = scraper.parse_movie_detail(soup, url, title)
                if movie_data:
                    movie_id = db_tools.insert_or_update_movie(movie_data)
                    if movie_id:
                        logging.debug(f'insert one movie, id: {movie_id}, title: ({title}), url: {url}')
                        last_movie_id = movie_id
                        succ_count += 1
                    else:
                        logging.warning(f'insert movie {url} failed.')
                else:
                    logging.warning(f'parse_movie_detail error. url: {url}')
            elif status_code and status_code == scraper.http_code_404:
                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
                logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
            elif status_code and status_code == scraper.http_code_login:
                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_login)
                logging.warning(f'401 page (login required). id: {movie_id}, title: ({title}), url: {url}, Skipping...')
            else:
                logging.warning(f'fetch_page error. url: {url}')
            time.sleep(0.5)
        logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
        # Break early in debug mode
        if debug:
            return True


# Map command shortcuts to functions
function_map = {
    "actor_list": fetch_actor_list,
    "maker_list": fetch_makers_list,
    "series_list": fetch_series_list,
    "makers": fetch_movies_by_maker,
    "series": fetch_movies_by_series,
    "movies": fetch_movies_detail,
    "actors": fetch_performers_detail,
}
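# Example: "--cmd maker_list,makers,movies" runs fetch_makers_list,
# fetch_movies_by_maker and fetch_movies_detail in that order (see main()).

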
# Main entry point
def main(cmd):
    """Run the requested fetch functions (or all of them) inside one task log entry."""
    # Start a task log entry
    task_id = db_tools.insert_task_log()
    if task_id is None:
        logging.warning('insert task log error.')
        return None

    logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, skip_local: {skip_local}, cmd: {cmd}')

    # Run the requested functions
    if cmd:
        function_names = cmd.split(",")  # split the comma-separated input
        for short_name in function_names:
            func = function_map.get(short_name.strip())  # look up the function for this shortcut
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {short_name}')
                func()
            else:
                logging.warning(f"{short_name} is not a valid function shortcut.")
    else:  # run everything
        for name, func in function_map.items():
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {name}')
                func()
            else:
                logging.warning(f"{name} is not a valid function shortcut.")

    logging.info('all processing completed!')
    db_tools.finalize_task_log(task_id)


# TODO:
# 1,


# Apply command-line arguments to the module-level flags
def set_env(args):
    global debug
    debug = args.debug
    if debug:
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)

    global force
    force = args.force

    global skip_local
    skip_local = args.skip_local

    global from_actor
    from_actor = args.from_actor

    global abnormal_only
    abnormal_only = args.abnormal_only

    global fast_mode
    fast_mode = args.fast_mode


if __name__ == "__main__":
    # Command-line argument handling
    keys_str = ",".join(function_map.keys())

    parser = argparse.ArgumentParser(description='fetch javdb data.')
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
    parser.add_argument('--skip_local', action='store_true', help='skip if cached html (true for skip)')
    parser.add_argument('--from_actor', action='store_true', help='only traverse actors/movies that came from actor_list (effective with --force)')
    parser.add_argument('--abnormal_only', action='store_true', help='only traverse actors/movies with abnormal URLs (404, login required, etc.; effective with --force)')
    parser.add_argument('--fast_mode', action='store_true', help='only traverse uncensored makers and series')
    args = parser.parse_args()

    set_env(args)
    main(args.cmd)