modify some scripts.
@@ -1 +0,0 @@
-/root/hostdir/scripts_data/detail_birth_astro.csv
@@ -1 +0,0 @@
-/root/hostdir/scripts_data/detail_birth_astro.json
@@ -1 +0,0 @@
-merged_birth_astro.json
@@ -1 +0,0 @@
-distributors.json
@@ -21,13 +21,13 @@ def fetch_performers_by_astro():
         url = scraper.astr_base_url + astro
         logging.info(f"Fetching data for {astro}, url {url} ...")
 
-        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
+        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
         if soup:
             list_data, next_url = scraper.parse_page_astro(soup, astro)
             if list_data:
                 for row in list_data:
                     # write to the performers table
-                    perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row['href'].lower() if row['href'] else '')
+                    perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_astro_list=1)
                     if perfomer_id:
                         logging.debug(f'insert performer index to db. performer_id: {perfomer_id}, name: {row["person"]}, href: {row["href"]}')
                     else:
@@ -35,6 +35,8 @@ def fetch_performers_by_astro():
 
             else:
                 logging.warning(f'fetch astro error. {url} ...')
+        elif status_code and status_code == 404:
+            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
         else:
             logging.warning(f'fetch astro error. {url} ...')
 
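This elif/else split is the pattern the commit applies to every fetch_* caller: fetch_page now returns a (soup, status_code) pair, so a permanent 404 can be skipped while transient failures are only logged for a later retry. A minimal sketch of the calling convention (process is a hypothetical stand-in for the per-page parsing):

    soup, status_code = scraper.fetch_page(url, validator)
    if soup:
        process(soup)        # fetched and validated
    elif status_code == 404:
        pass                 # page is gone for good: skip, do not retry
    else:
        pass                 # transient failure: log and try again later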
@@ -49,19 +51,21 @@ def fetch_performers_by_birth():
         for day in range(1, 32):  # iterate over days 1-31
             url = scraper.birth_base_url.format(month=month, day=day)
             logging.info(f"Fetching data for birth, url {url}")
-            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
             if soup:
                 list_data, next_url = scraper.parse_page_birth(soup, month, day)
                 if list_data:
                     for row in list_data:
                         # write to the performers table
-                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row['href'].lower() if row['href'] else '')
+                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_birth_list=1)
                         if perfomer_id:
                             logging.debug(f'insert performer index to db. performer_id: {perfomer_id}, name: {row["person"]}, href: {row["href"]}')
                         else:
                             logging.warning(f'insert performer index failed. name: {row["person"]}, href: {row["href"]}')
                 else:
                     logging.warning(f'fetch birth error. {url} ...')
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
             else:
                 logging.warning(f'fetch birth error. {url} ...')
 
@@ -69,32 +73,56 @@ def fetch_performers_by_birth():
     if debug:
         return True
 
 # handle ethnicity names that contain spaces
 def format_ethnic(ethnic):
     return ethnic.replace(' ', '+')
 
+# refresh the ethnicity list
+def fetch_ethic_list():
+    url = scraper.ethnic_list_url
+    logging.info(f"Fetching data for performer's ethnic list, url {url} ...")
+    soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="ethnicity1", attr_type="id"))
+    if soup:
+        list_data = scraper.parse_page_ethnic_list(soup, url)
+        if list_data:
+            for row in list_data:
+                dist_id = db_tools.insert_or_update_ethnic({'name': row['name'], 'href': row.get('href', '')})
+                if dist_id:
+                    logging.debug(f'insert one record into ethnic table. id: {dist_id}, name: {row["name"]}, href: {row.get("href", "")}')
+        else:
+            logging.warning(f'fetch ethnic error. {url} ...')
+    elif status_code and status_code == 404:
+        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+    else:
+        logging.warning(f'fetch page error. {url} ...')
 
 # fetch the performer list per ethnicity, with pagination
 def fetch_performers_by_ethnic():
-    for ethnic in scraper.ethnic_list:
-        url = scraper.ethnic_url + format_ethnic(ethnic)
+    # refresh the list first
+    fetch_ethic_list()
+
+    ethnic_list = db_tools.query_ethnic_hrefs()
+    for row in ethnic_list:
+        url = row['href']
+        ethnic = row['name']
         next_url = url
 
         while next_url:
             logging.info(f"Fetching data for {ethnic}, url {url} ...")
-            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
                                       parser="lxml", preprocessor=scraper.preprocess_html)
             if soup:
                 list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
                 if list_data:
                     for row in list_data:
                         # write to the performers table
-                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row['href'].lower() if row['href'] else '')
+                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_ethnic_list=1)
                         if perfomer_id:
                             logging.debug(f'insert performer index to db. performer_id: {perfomer_id}, name: {row["person"]}, href: {row["href"]}')
                         else:
                             logging.warning(f'insert performer index failed. name: {row["person"]}, href: {row["href"]}')
                 else:
                     logging.warning(f'fetch ethnic error. {url} ...')
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skipping...')
+                break
             else:
                 logging.warning(f'fetch ethnic error. {url} ...')
 
@@ -106,7 +134,7 @@ def fetch_performers_by_ethnic():
 def fetch_distributors_list():
     url = scraper.distributors_list_url
     logging.info(f"Fetching data for distributors list, url {url} ...")
-    soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
+    soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
     if soup:
         list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
         if list_data:
@@ -117,6 +145,8 @@ def fetch_distributors_list():
                 logging.debug(f'insert one record into distributors table. id: {dist_id}, name: {row["name"]}, href: {dis_url}')
         else:
             logging.warning(f'fetch distributors error. {url} ...')
+    elif status_code and status_code == 404:
+        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
     else:
         logging.warning(f'fetch distributors error. {url} ...')
 
@@ -124,7 +154,7 @@ def fetch_distributors_list():
 def fetch_studios_list():
     url = scraper.studios_list_url
     logging.info(f"Fetching data for studios list, url {url} ...")
-    soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
+    soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
     if soup:
         list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
         if list_data:
@@ -135,52 +165,68 @@ def fetch_studios_list():
                 logging.debug(f'insert one record into studios table. id: {stu_id}, name: {row["name"]}, href: {stu_url}')
         else:
             logging.warning(f'fetch studios error. {url} ...')
+    elif status_code and status_code == 404:
+        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
     else:
         logging.warning(f'fetch studios error. {url} ...')
 
 
 # refresh the movie info for every distributor in the list
 def fetch_movies_by_dist():
     # refresh the distributor list first
     fetch_distributors_list()
 
     url_list = db_tools.query_distributor_hrefs()
     if debug:
         url_list = db_tools.query_distributor_hrefs(name='vixen.com')
     for url in url_list:
         logging.info(f"Fetching data for distributor url {url} ...")
-        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
+        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
        if soup:
             list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
             if list_data:
                 for movie in list_data:
-                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']))
+                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_dist_list=1)
                     if tmp_id:
                         logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                     else:
                         logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
             else:
                 logging.warning(f'parse_page_movie error. url: {url}')
+        elif status_code and status_code == 404:
+            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
         else:
             logging.warning(f'fetching page error. {url}')
         # break early when debugging
         if debug:
             break
 
 # refresh the movie info for every studio in the list
 def fetch_movies_by_stu():
     # refresh the studio list first
     fetch_studios_list()
 
     url_list = db_tools.query_studio_hrefs()
     if debug:
         url_list = db_tools.query_studio_hrefs(name='vixen.com')
     for url in url_list:
         logging.info(f"Fetching data for studio url {url} ...")
-        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
+        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
         if soup:
             list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
             if list_data:
                 for movie in list_data:
-                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']))
+                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_stu_list=1)
                     if tmp_id:
                         logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                     else:
                         logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
             else:
                 logging.warning(f'parse_page_movie error. url: {url}')
+        elif status_code and status_code == 404:
+            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
         else:
             logging.warning(f'fetching page error. {url}')
         # break early when debugging
         if debug:
             break
@@ -198,7 +244,7 @@ def fetch_performers_detail():
             url = performer['href']
             person = performer['name']
             logging.info(f"Fetching data for performer ({person}), url {url} ...")
-            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
             if soup:
                 data = scraper.parse_page_performer(soup)
                 if data:
@@ -220,6 +266,8 @@ def fetch_performers_detail():
                     })
                 else:
                     logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
             else:
                 logging.warning(f'fetch_page error. person: ({person}), url: {url}')
             # break when debugging
@@ -238,7 +286,7 @@ def fetch_movies_detail():
             url = movie['href']
             title = movie['title']
             logging.info(f"Fetching data for movie ({title}), url {url} ...")
-            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
             if soup:
                 movie_data = scraper.parse_page_movie(soup, url, title)
                 if movie_data:
@@ -257,54 +305,59 @@ def fetch_movies_detail():
                     utils.write_movie_json(url, movie_data)
                 else:
                     logging.warning(f'parse_page_movie error. url: {url}')
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
             else:
                 logging.warning(f'fetch_page error. url: {url}')
             # break early when debugging
             if debug:
                 return True
 
-# check for updates
-def check_update():
 
+# map command-line shortcuts to functions
+function_map = {
+    "astro": fetch_performers_by_astro,
+    "birth": fetch_performers_by_birth,
+    "ethnic": fetch_performers_by_ethnic,
+    "dist": fetch_movies_by_dist,
+    "stu": fetch_movies_by_stu,
+    "performers": fetch_performers_detail,
+    "movies": fetch_movies_detail,
+}
+
+# main function
+def main(cmd, args_debug, args_force):
     global debug
     debug = args_debug
 
     global force
     force = args_force
 
     # start the task
     task_id = db_tools.insert_task_log()
     if task_id is None:
         logging.warning('insert task log error.')
         return None
 
     logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, cmd: {cmd}')
 
-    # refresh the astro performer list
-    db_tools.update_task_log(task_id, task_status='fetching astro list')
-    fetch_performers_by_astro()
-
-    # refresh the birthday performer list
-    db_tools.update_task_log(task_id, task_status='fetching birth list')
-    fetch_performers_by_birth()
-
-    # refresh the ethnicity performer list
-    db_tools.update_task_log(task_id, task_status='fetching ethnic list')
-    fetch_performers_by_ethnic()
-
-    # refresh the distributors list
-    db_tools.update_task_log(task_id, task_status='fetching distributor list')
-    fetch_distributors_list()
-
-    # refresh the studios list
-    db_tools.update_task_log(task_id, task_status='fetching studio list')
-    fetch_studios_list()
-
-    # refresh the movie lists
-    db_tools.update_task_log(task_id, task_status='fetching movie list by dist')
-    fetch_movies_by_dist()
-    db_tools.update_task_log(task_id, task_status='fetching movie list by stu')
-    fetch_movies_by_stu()
-
-    # update performer details
-    db_tools.update_task_log(task_id, task_status='fetching performers')
-    fetch_performers_detail()
-
-    # update movie details
-    db_tools.update_task_log(task_id, task_status='fetching movies')
-    fetch_movies_detail()
+    # run the requested functions
+    if cmd:
+        function_names = cmd.split(",")  # split the input
+        for short_name in function_names:
+            func = function_map.get(short_name.strip())  # look up the mapped function
+            if callable(func):
+                db_tools.update_task_log(task_id, task_status=f'Running {func}')
+                func()
+            else:
+                print(f"Warning: {short_name} is not a valid function shortcut.")
+    else:  # run everything
+        for name, func in function_map.items():
+            if callable(func):
+                db_tools.update_task_log(task_id, task_status=f'Running {func}')
+                func()
+            else:
+                print(f"Warning: {name} is not a valid function shortcut.")
 
     logging.info('all process completed!')
     db_tools.finalize_task_log(task_id)
@@ -314,80 +367,14 @@ def check_update():
 # 2, cross-check the movie lists between distributors and studios
 # 3, for non-uniform data: import all local performers and movies by hand first, then use this program to fetch new entries incrementally
 
-# process local data
-def load_data():
-    # import the performers data already on disk
-    perfomers_file = '../result/detail.json'
-    performers_data = utils.read_json(perfomers_file)
-    if performers_data is None:
-        print('read file error.')
-        performers_data = []
-
-    # read the performer list from the database
-    existed_performer_hrefs = db_tools.query_performer_hrefs()
-    if not existed_performer_hrefs:
-        logging.warning('get existed performers from db error.')
-        return None
-
-    for person in performers_data:
-        # unless forced, skip records we already have
-        if not force and person['href'] in existed_performer_hrefs:
-            continue
-
-        performer_id = db_tools.insert_or_update_performer(person)
-        if performer_id:
-            logging.info(f'insert one person, id: {performer_id}, person: {person["person"]}, url: {person["href"]}')
-        else:
-            logging.warning(f'insert person: {person["person"]}, {person["href"]} failed.')
-    logging.info('all performers loaded in database.')
-
-    # import the movies data already on disk
-    movies_file = '../result/movie_details.json'
-    movies_data = utils.read_json(movies_file)
-    if movies_data is None:
-        print('read file error.')
-        movies_data = []
-    for movie in movies_data:
-        # fix non-standard urls
-        if movie['DistributorHref']:
-            movie['DistributorHref'] = utils.dist_stu_href_rewrite(movie['DistributorHref'].lower())
-        if movie['StudioHref']:
-            movie['StudioHref'] = utils.dist_stu_href_rewrite(movie['StudioHref'].lower())
-        movie_id = db_tools.insert_or_update_movie(movie)
-        if movie_id:
-            logging.info(f'insert one movie, id: {movie_id}, title: {movie["title"]} url: {movie["href"]}')
-        else:
-            logging.warning(f'insert movie {movie["title"]}, {movie["href"]} failed.')
-
-    logging.info('task completed.')
-
-
-# main function
-def main(task, args_debug, args_force):
-    global debug
-    debug = args_debug
-    if debug:
-        logging.info('Debug mode enabled.')
-
-    global force
-    force = args_force
-    if force:
-        logging.info('force update for all data.')
-
-    if task == 'fetch':
-        check_update()
-    elif task == 'load':
-        load_data()
-    else:
-        print('unknown command. see --help.')
-
 
 if __name__ == "__main__":
     # command-line argument handling
+    keys_str = ",".join(function_map.keys())
+
     parser = argparse.ArgumentParser(description='fetch iafd data.')
-    parser.add_argument('--task', type=str, default='fetch', help='fetch from iafd.com or load from local data ... (fetch , load)')
+    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
     parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
     parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
     args = parser.parse_args()
 
-    main(args.task, args.debug, args.force)
+    main(args.cmd, args.debug, args.force)
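With the new --cmd flag, individual steps can be run by shortcut instead of the whole pipeline. A hypothetical invocation (the script file name is an assumption; the shortcut names come from function_map above):

    python fetch.py --cmd astro,birth --debug
    python fetch.py                     # no --cmd: run every step in function_map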
@@ -21,15 +21,14 @@ astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'S
 
 birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
 
 ethnic_url = f"{host_url}/lookupethnic.rme/ethnic="
-ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american', 'middle eastern', 'mediteranean', 'indian', 'polynesian', 'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african', 'south asian']
 
 distributors_list_url = f'{host_url}/distrib.asp'
 distributors_base_url = f"{host_url}/distrib.rme/distrib="
 
 studios_list_url = f"{host_url}/studio.asp"
 studios_base_url = f"{host_url}/studio.rme/studio="
 
+ethnic_list_url = f'{host_url}/advsearch.asp'
+
 # set up the headers and the scraper
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
@@ -42,9 +41,15 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
         try:
             if host_url not in url.lower():
                 logging.error(f'wrong url format: {url}')
-                return None
+                return None, None
 
             response = scraper.get(url, headers=headers)
 
+            # handle the HTTP status code
+            if response.status_code == 404:
+                logging.warning(f"Page not found (404): {url}")
+                return None, 404  # return the 404 directly so the caller can skip
+
             response.raise_for_status()  # raise on other HTTP errors
 
             # preprocess the HTML (if a preprocessor was given)
@@ -52,7 +57,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
 
             soup = BeautifulSoup(html_text, parser)
             if validator(soup):  # run the custom page check
-                return soup
+                return soup, response.status_code
 
             logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
         except cloudscraper.exceptions.CloudflareChallengeError as e:
@@ -63,7 +68,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
             logging.error(f"Unexpected error on {url}: {e}, Retrying...")
 
     logging.error(f'Fetching failed after max retries. {url}')
-    return None  # still failing after the maximum number of retries
+    return None, None  # still failing after the maximum number of retries
 
 # fix the HTML structure: strip redundant tags and repair <a> tags; needed when fetching the ethnicity pages
 def preprocess_html(html):
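Taken together, the three hunks above give fetch_page a uniform contract. A summary of its possible return values, as implied by this diff (the non-404 status is whatever the validated response carried):

    # (soup, status_code)  - page fetched and validator(soup) passed
    # (None, 404)          - page missing; callers skip instead of retrying
    # (None, None)         - bad URL, Cloudflare challenge, or retries exhausted
    soup, status_code = fetch_page(url, validator)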
@@ -83,6 +88,31 @@ def generic_validator(soup, tag, identifier, attr_type="id"):
 def movie_validator(soup, table_id):
     return soup.find("table", id=table_id) is not None
 
+# parse the HTML and extract the data we need
+def parse_page_ethnic_list(soup, href):
+    div_root = soup.find("select", id="ethnicity1")
+    if not div_root:
+        logging.warning(f"Warning: No 'ethnicity1' select found in {href}")
+        return None
+
+    list_data = []
+
+    # extract all the <option> tags
+    options = div_root.find_all('option')
+    if options:
+        # parse the value and the text content of each option
+        for option in options:
+            href = option.get('value', None)
+            text = option.text.strip()
+            if href and href.lower() == 'none':
+                continue
+            list_data.append({
+                "name": text,
+                "href": host_url + href if href else ''
+            })
    return list_data
 
 
 # parse the HTML and extract the data we need
 def parse_page_astro(soup, astro):
     astro_div = soup.find("div", id="astro")
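Every call site above builds its validator with functools.partial, so fetch_page only ever sees a one-argument callable. A self-contained sketch of the pattern (the generic_validator body here is a plausible reconstruction matching the call sites, not copied from the repo):

    from functools import partial
    from bs4 import BeautifulSoup

    def generic_validator(soup, tag, identifier, attr_type="id"):
        # true when the page contains the element the caller expects
        return soup.find(tag, attrs={attr_type: identifier}) is not None

    validator = partial(generic_validator, tag="div", identifier="astro", attr_type="id")
    soup = BeautifulSoup('<div id="astro"></div>', "html.parser")
    assert validator(soup)  # only the soup argument is left to supply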
scripts/iafd/src/load.py (new file, 107 lines)
@@ -0,0 +1,107 @@
+
+import json
+import time
+import csv
+import argparse
+import logging
+from functools import partial
+import config
+import sqlite_utils as db_tools
+import iafd_scraper as scraper
+import utils
+
+config.setup_logging()
+
+res_dir = '/root/hostdir/scripts_data/iafd_202503'
+
+# performer lists
+def load_performer_list(file, **from_fields):
+    json_data = utils.read_json(file)
+    if json_data is None:
+        json_data = []
+
+    total_rows = len(json_data)
+    loaded_rows = 0
+    succ = 0
+    for row in json_data:
+        row_id = db_tools.insert_performer_index(name=row.get('person', ''),
+                                                 href=row.get('href', ''),
+                                                 **from_fields
+                                                 )
+        if row_id:
+            logging.debug(f'insert one person, id: {row_id}, person: {row["person"]}, url: {row["href"]}')
+            succ += 1
+        else:
+            logging.warning(f'insert person failed. {row["person"]}, {row["href"]}')
+        loaded_rows += 1
+        if loaded_rows % 10000 == 0:
+            logging.info(f'loading file: {file}, total rows: {total_rows}, loaded rows: {loaded_rows}, succ rows: {succ}')
+
+    logging.info(f'load data succ. file: {file}, rows: {total_rows}, succ rows: {succ}')
+
+# movie lists
+def load_movie_list(file, **from_fields):
+    json_data = utils.read_json(file)
+    if json_data is None:
+        json_data = []
+
+    total_rows = len(json_data)
+    loaded_rows = 0
+    succ = 0
+    for row in json_data:
+        row_id = db_tools.insert_movie_index(title=row.get('title', ''),
+                                             href=row.get('href', ''),
+                                             release_year=utils.to_number(row['year']),
+                                             **from_fields
+                                             )
+        if row_id:
+            logging.debug(f'insert one movie, id: {row_id}, title: {row["title"]}, url: {row["href"]}')
+            succ += 1
+        else:
+            logging.warning(f'insert movie failed: {row["title"]}, {row["href"]}')
+        loaded_rows += 1
+        if loaded_rows % 10000 == 0:
+            logging.info(f'loading file: {file}, total rows: {total_rows}, loaded rows: {loaded_rows}, succ rows: {succ}')
+
+    logging.info(f'load data succ. file: {file}, rows: {len(json_data)}, succ rows: {succ}')
+
+
+# performer details
+def load_performers(file):
+    json_data = utils.read_json(file)
+    if json_data is None:
+        json_data = []
+
+    total_rows = len(json_data)
+    loaded_rows = 0
+    succ = 0
+    for row in json_data:
+        performer_id = db_tools.insert_or_update_performer(row)
+        if performer_id:
+            logging.debug(f'insert one person, id: {performer_id}, person: {row["person"]}, url: {row["href"]}')
+            succ += 1
+        else:
+            logging.warning(f'insert person failed. {row["person"]}, {row["href"]}')
+        loaded_rows += 1
+        if loaded_rows % 10000 == 0:
+            logging.info(f'loading file: {file}, total rows: {total_rows}, loaded rows: {loaded_rows}, succ rows: {succ}')
+
+    logging.info(f'load data succ. file: {file}, rows: {len(json_data)}, succ rows: {succ}')
+
+
+if __name__ == "__main__":
+
+    load_performer_list(f'{res_dir}/astro.json', from_astro_list=1)
+    time.sleep(3)
+    load_performer_list(f'{res_dir}/birth.json', from_birth_list=1)
+    time.sleep(3)
+    load_performer_list(f'{res_dir}/ethnic.json', from_ethnic_list=1)
+    time.sleep(3)
+
+    load_movie_list(f'{res_dir}/distributors.json', from_dist_list=1)
+    time.sleep(3)
+    load_movie_list(f'{res_dir}/studios.json', from_stu_list=1)
+    time.sleep(3)
+
+    load_performers(f'{res_dir}/performers.json')
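The loader replays the exported JSON files into SQLite in dependency order: index lists first (so the from_* provenance flags get set), then the full performer records. Assuming the hard-coded res_dir above already holds the listed files, a run is simply:

    python load.py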
@@ -25,20 +25,48 @@ def get_id_by_href(table: str, href: str) -> int:
     return row[0] if row else None
 
 # insert a performer index entry, sourced from the list pages
-def insert_performer_index(name, href):
+def insert_performer_index(name, href, from_astro_list=None, from_birth_list=None, from_ethnic_list=None, from_movie_list=None):
     try:
+        # check whether the performer already exists
         cursor.execute("""
-            INSERT OR IGNORE INTO iafd_performers (href, name) VALUES (?, ?)
-        """, (
-            href, name
-        ))
+            SELECT id, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list
+            FROM iafd_performers WHERE href = ?
+        """, (href,))
+        existing_performer = cursor.fetchone()
+
+        if existing_performer:  # the performer already exists
+            performer_id, existing_name, existing_astro, existing_birth, existing_ethnic, existing_movie = existing_performer
+
+            # keep the stored values for any flag that was not passed in
+            from_astro_list = from_astro_list if from_astro_list is not None else existing_astro
+            from_birth_list = from_birth_list if from_birth_list is not None else existing_birth
+            from_ethnic_list = from_ethnic_list if from_ethnic_list is not None else existing_ethnic
+            from_movie_list = from_movie_list if from_movie_list is not None else existing_movie
+
+            cursor.execute("""
+                UPDATE iafd_performers
+                SET name = ?,
+                    from_astro_list = ?,
+                    from_birth_list = ?,
+                    from_ethnic_list = ?,
+                    from_movie_list = ?,
+                    updated_at = datetime('now', 'localtime')
+                WHERE href = ?
+            """, (name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list, href))
+        else:  # the performer does not exist yet: insert
+            cursor.execute("""
+                INSERT INTO iafd_performers (href, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list)
+                VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
+            """, (href, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list))
 
         conn.commit()
 
         performer_id = get_id_by_href('iafd_performers', href)
         if performer_id:
-            logging.debug(f'insert one performer index, id: {performer_id}, name: {name}, href: {href}')
-
-        return performer_id
+            logging.debug(f'Inserted/Updated performer index, id: {performer_id}, name: {name}, href: {href}')
 
+        return performer_id
     except sqlite3.Error as e:
         conn.rollback()
         logging.error(f"database error: {e}")
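The select-then-update above preserves any from_* flag the caller leaves as None. SQLite can express the same merge in one statement; a sketch of the equivalent upsert, assuming href carries a UNIQUE constraint (which the old INSERT OR IGNORE already implies) — an alternative, not what the commit ships:

    cursor.execute("""
        INSERT INTO iafd_performers (href, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list)
        VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
        ON CONFLICT(href) DO UPDATE SET
            name = excluded.name,
            -- keep the stored flag unless the caller supplied one
            from_astro_list = COALESCE(?, from_astro_list),
            from_birth_list = COALESCE(?, from_birth_list),
            from_ethnic_list = COALESCE(?, from_ethnic_list),
            from_movie_list = COALESCE(?, from_movie_list),
            updated_at = datetime('now', 'localtime')
    """, (href, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list,
          from_astro_list, from_birth_list, from_ethnic_list, from_movie_list))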
@@ -48,28 +76,58 @@ def insert_performer_index(name, href):
         logging.error(f"unknown error: {e}")
         return None
 
 
 # insert a movie index entry, sourced from the list pages
-def insert_movie_index(title, href, release_year=0):
+def insert_movie_index(title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
     try:
-        # insert or update the movie info
+        # check whether the movie already exists
         cursor.execute("""
-            INSERT OR IGNORE INTO iafd_movies (title, href, release_year) VALUES (?, ?, ?)
-        """,
-            (title, href, release_year)
-        )
+            SELECT id, title, release_year, from_performer_list, from_dist_list, from_stu_list
+            FROM iafd_movies WHERE href = ?
+        """, (href,))
+        existing_movie = cursor.fetchone()
+
+        if existing_movie:  # the movie already exists
+            movie_id, existing_title, existing_year, existing_performer, existing_dist, existing_stu = existing_movie
+
+            # keep the stored values for anything that was not passed in
+            release_year = release_year if release_year != 0 else existing_year
+            from_performer_list = from_performer_list if from_performer_list is not None else existing_performer
+            from_dist_list = from_dist_list if from_dist_list is not None else existing_dist
+            from_stu_list = from_stu_list if from_stu_list is not None else existing_stu
+
+            cursor.execute("""
+                UPDATE iafd_movies
+                SET title = ?,
+                    release_year = ?,
+                    from_performer_list = ?,
+                    from_dist_list = ?,
+                    from_stu_list = ?,
+                    updated_at = datetime('now', 'localtime')
+                WHERE href = ?
+            """, (title, release_year, from_performer_list, from_dist_list, from_stu_list, href))
+        else:  # the movie does not exist yet: insert
+            cursor.execute("""
+                INSERT INTO iafd_movies (title, href, release_year, from_performer_list, from_dist_list, from_stu_list)
+                VALUES (?, ?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
+            """, (title, href, release_year, from_performer_list, from_dist_list, from_stu_list))
 
         conn.commit()
 
         movie_id = get_id_by_href('iafd_movies', href)
         if movie_id:
-            logging.debug(f'insert one movie index, id: {movie_id}, title: {title}, href: {href}')
-
-        return movie_id
+            logging.debug(f'Inserted/Updated movie index, id: {movie_id}, title: {title}, href: {href}')
 
+        return movie_id
 
     except sqlite3.Error as e:
         conn.rollback()
         logging.error(f"database error: {e}")
         return None
     except Exception as e:
         conn.rollback()
-        logging.error("Error inserting movie: %s", e)
+        logging.error(f"unknown error: {e}")
         return None
 
 
 # insert performer-movie relation rows
 def insert_performer_movie(performer_id, movie_id, role, notes):
     try:
@@ -167,16 +225,14 @@ def insert_or_update_performer(data):
         conn.commit()
 
         # insert the movie credits; a person may appear in both the 'personal' and 'director' roles
-        credits = data['credits']
-        if credits is None:
-            return performer_id
+        credits = data.get('credits', {})
         for role, movies in credits.items():
             if movies:
                 for movie in movies:
                     movie_id = get_id_by_href('iafd_movies', movie['href'])
                     # the movie does not exist yet: insert it first
                     if movie_id is None:
-                        movie_id = insert_movie_index(movie['title'], movie['href'], utils.to_number(movie['year']))
+                        movie_id = insert_movie_index(movie['title'], movie['href'], utils.to_number(movie['year']), from_performer_list=1)
                     if movie_id:
                         tmp_id = insert_performer_movie(performer_id, movie_id, role, movie['notes'])
                         if tmp_id:
@@ -269,6 +325,54 @@ def query_performer_hrefs(**filters):
         return None
 
 
+# insert or update an ethnicity record
+def insert_or_update_ethnic(data):
+    try:
+        cursor.execute("""
+            INSERT INTO iafd_meta_ethnic (name, href)
+            VALUES (?, ?)
+            ON CONFLICT(href) DO UPDATE SET
+                name = excluded.name
+        """, (data["name"], data["href"]))
+        conn.commit()
+
+        # fetch the ethnic id
+        cursor.execute("SELECT id FROM iafd_meta_ethnic WHERE href = ?", (data["href"],))
+        dist_id = cursor.fetchone()[0]
+        if dist_id:
+            logging.debug(f"inserted/updated ethnic: {data['name']}")
+            return dist_id
+        else:
+            return None
+    except sqlite3.Error as e:
+        conn.rollback()
+        logging.error(f"database error: {e}")
+        return None
+
+# query the href list with optional filters
+def query_ethnic_hrefs(**filters):
+    try:
+        sql = "SELECT href, name FROM iafd_meta_ethnic WHERE 1=1"
+        params = []
+
+        if "id" in filters:
+            sql += " AND id = ?"
+            params.append(filters["id"])
+        if "href" in filters:
+            sql += " AND href = ?"
+            params.append(filters["href"])
+        if "name" in filters:
+            sql += " AND name LIKE ?"
+            params.append(f"%{filters['name']}%")
+
+        cursor.execute(sql, params)
+        # return [row[0].lower() for row in cursor.fetchall()]  # hrefs lowercased
+        return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
+
+    except sqlite3.Error as e:
+        logging.error(f"querying hrefs failed: {e}")
+        return None
+
 # insert or update a distributor
 def insert_or_update_distributor(data):
     try:
@@ -436,7 +540,7 @@ def insert_or_update_movie(movie_data):
         director_id = get_id_by_href('iafd_performers', movie_data['DirectorHref'])
         # the director does not exist yet: insert one
         if director_id is None:
-            director_id = insert_performer_index(movie_data['Director'], movie_data['DirectorHref'])
+            director_id = insert_performer_index(movie_data['Director'], movie_data['DirectorHref'], from_movie_list=1)
 
         # insert or update the movie info
         cursor.execute(
@@ -469,7 +573,7 @@ def insert_or_update_movie(movie_data):
             performer_id = get_id_by_href('iafd_performers', performer['href'])
             # the performer does not exist yet: insert first
             if performer_id is None:
-                performer_id = insert_performer_index(performer['name'], performer['href'])
+                performer_id = insert_performer_index(performer['name'], performer['href'], from_movie_list=1)
             if performer_id:
                 notes = '|'.join(tag for tag in performer['tags'] if tag != performer['name'])
                 tmp_id = insert_performer_movie(performer_id, movie_id, 'personal', notes)
@@ -581,7 +685,13 @@ def insert_task_log():
             INSERT INTO iafd_task_log (task_status) VALUES ('Start')
         """)
         conn.commit()
-        return cursor.lastrowid  # the inserted task_id
+
+        task_id = cursor.lastrowid
+        if task_id is None:
+            return None
+        update_task_log(task_id=task_id, task_status='Start')
+
+        return task_id  # the inserted task_id
     except sqlite3.Error as e:
         logging.error(f"inserting task log failed: {e}")
         return None
 
@@ -20,26 +20,29 @@ def fetch_actor_list():
     next_url = scraper.actors_uncensored_base_url
     while next_url:
         logging.info(f'fetching page {next_url}')
-        soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
+        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
         if soup:
             list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
             if list_data:
                 # write to the database
                 for row in list_data:
-                    actor_id = db_tools.insert_actor_index(name=row['name'], href=row['href'] if row['href'] else '')
+                    actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
                     if actor_id:
                         logging.debug(f'insert actor index to db. actor_id: {actor_id}, name: {row["name"]}, href: {row["href"]}')
                     else:
                         logging.warning(f'insert actor index failed. name: {row["name"]}, href: {row["href"]}')
             else:
                 logging.warning(f'fetch actor error. {next_url} ...')
+        elif status_code and status_code == 404:
+            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
+            break
 
 # fetch the makers list
 def fetch_makers_list():
     next_url = scraper.makers_uncensored_base_url
     while next_url:
         logging.info(f'fetching page {next_url}')
-        soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
+        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
         if soup:
             list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
             if list_data:
@@ -53,12 +56,16 @@ def fetch_makers_list():
             else:
                 logging.warning(f'fetch maker error. {next_url} ...')
 
+        elif status_code and status_code == 404:
+            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
+            break
 
 # fetch the series list
 def fetch_series_list():
     next_url = scraper.series_uncensored_base_url
     while next_url:
         logging.info(f'fetching page {next_url}')
-        soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
+        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
         if soup:
             list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
             if list_data:
@@ -72,6 +79,10 @@ def fetch_series_list():
             else:
                 logging.warning(f'fetch series error. {next_url} ...')
 
+        elif status_code and status_code == 404:
+            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
+            break
+
 
 # refresh the movie info for every maker in the list
 def fetch_movies_by_maker():
@@ -79,21 +90,27 @@ def fetch_movies_by_maker():
     if debug:
         url_list = db_tools.query_maker_hrefs(name='muramura')
     for url in url_list:
-        next_url = url
-        while True:
+        # strip the downloadable flag from the url (if present)
+        next_url = utils.remove_url_query(url)
+        while next_url:
             logging.info(f"Fetching data for maker url {next_url} ...")
-            soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
+            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
             if soup:
                 list_data, next_url = scraper.parse_maker_detail(soup, next_url)
                 if list_data:
                     for movie in list_data:
-                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'])
+                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1)
                         if tmp_id:
                             logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                         else:
                             logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
                 else:
                     logging.warning(f'parse_page_movie error. url: {next_url}')
 
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
+                break
+
         # break early when debugging
         if debug:
             return True
@@ -104,21 +121,26 @@ def fetch_movies_by_series():
     if debug:
         url_list = db_tools.query_series_hrefs(name='10musume')
     for url in url_list:
-        next_url = url
-        while True:
+        # strip the downloadable flag from the url (if present)
+        next_url = utils.remove_url_query(url)
+        while next_url:
             logging.info(f"Fetching data for series url {next_url} ...")
-            soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
+            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
             if soup:
                 list_data, next_url = scraper.parse_series_detail(soup, next_url)
                 if list_data:
                     for movie in list_data:
-                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'])
+                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1)
                         if tmp_id:
                             logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                         else:
                             logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
                 else:
                     logging.warning(f'parse_page_movie error. url: {next_url}')
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
+                break
 
         # break early when debugging
         if debug:
             return True
@@ -129,23 +151,31 @@ def fetch_performers_detail():
     perfomers_list = []
     while True:
         # pull a batch from the database each round instead of loading everything at once
-        perfomers_list = db_tools.query_actors(is_full_data=0, limit=10)
+        perfomers_list = db_tools.query_actors(is_full_data=0, limit=100)
         if len(perfomers_list) < 1:
             logging.info('all performers fetched.')
             break
         for performer in perfomers_list:
             url = performer['href']
             person = performer['name']
+            pic = ''
+            alias = []
 
             next_url = url
             all_movies = []
             while next_url:
                 logging.info(f"Fetching data for actor ({person}), url {next_url} ...")
-                soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
+                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
                 if soup:
                     data, next_url = scraper.parse_actor_detail(soup, next_url)
                     if data:
-                        all_movies.extend(data)
+                        pic = data.get('pic', '')
+                        alias = data.get('alias', [])
+                        all_movies.extend(data.get('movies', []))
 
+                elif status_code and status_code == 404:
+                    logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skipping...')
+                    break
                 else:
                     logging.warning(f'fetch_page error. person: ({person}), url: {url}')
 
@@ -153,8 +183,8 @@ def fetch_performers_detail():
             performer_id = db_tools.insert_or_update_actor({
                 'href': url,
                 'name': person,
-                'pic': '',
-                'alias': [],
+                'pic': pic,
+                'alias': alias,
                 'credits': all_movies
             })
             if performer_id:
@@ -169,7 +199,7 @@ def fetch_movies_detail():
     movies_list = []
     while True:
-        movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=10)
+        movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=100)
         if len(movies_list) < 1:
             logging.info('all movies fetched.')
             break
@@ -177,7 +207,7 @@ def fetch_movies_detail():
         url = movie['href']
         title = movie['title']
         logging.info(f"Fetching data for movie ({title}), url {url} ...")
-        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
+        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
         if soup:
             movie_data = scraper.parse_movie_detail(soup, url, title)
             if movie_data:
@@ -188,84 +218,76 @@ def fetch_movies_detail():
                 logging.warning(f'insert movie {url} failed.')
             else:
                 logging.warning(f'parse_page_movie error. url: {url}')
 
+        elif status_code and status_code == 404:
+            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}')
+            break
         else:
             logging.warning(f'fetch_page error. url: {url}')
         # break early when debugging
         if debug:
             return True
 
-# check for updates
-def check_update():
 
+# map command-line shortcuts to functions
+function_map = {
+    "actor_list": fetch_actor_list,
+    "maker_list": fetch_makers_list,
+    "series_list": fetch_series_list,
+    "makers": fetch_movies_by_maker,
+    "series": fetch_movies_by_series,
+    "movies": fetch_movies_detail,
+    "actors": fetch_performers_detail,
+}
+
+# main function
+def main(cmd, args_debug, args_force):
     global debug
     debug = args_debug
 
     global force
     force = args_force
 
     # start the task
     task_id = db_tools.insert_task_log()
     if task_id is None:
         logging.warning('insert task log error.')
         return None
 
     logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, cmd: {cmd}')
 
+    if False:
+        # refresh the actor list
+        db_tools.update_task_log(task_id, task_status='fetching actor list')
+        fetch_actor_list()
+
+        # refresh the makers list
+        db_tools.update_task_log(task_id, task_status='fetching maker list')
+        fetch_makers_list()
+
+        # refresh the series list
+        db_tools.update_task_log(task_id, task_status='fetching series list')
+        fetch_series_list()
+
+        # refresh the movie lists
+        db_tools.update_task_log(task_id, task_status='fetching movie list by maker')
+        fetch_movies_by_maker()
+        db_tools.update_task_log(task_id, task_status='fetching movie list by series')
+        fetch_movies_by_series()
+
+        # update actor details
+        db_tools.update_task_log(task_id, task_status='fetching performers')
+        fetch_performers_detail()
+
+        # update movie details
+        db_tools.update_task_log(task_id, task_status='fetching movies')
+        fetch_movies_detail()
+
+    # run the requested functions
+    if cmd:
+        function_names = cmd.split(",")  # split the input
+        for short_name in function_names:
+            func = function_map.get(short_name.strip())  # look up the mapped function
+            if callable(func):
+                db_tools.update_task_log(task_id, task_status=f'Running {func}')
+                func()
+            else:
+                logging.warning(f"{short_name} is not a valid function shortcut.")
+    else:  # run everything
+        for name, func in function_map.items():
+            if callable(func):
+                db_tools.update_task_log(task_id, task_status=f'Running {func}')
+                func()
+            else:
+                logging.warning(f"{name} is not a valid function shortcut.")
 
     logging.info('all process completed!')
     db_tools.finalize_task_log(task_id)
 
 # TODO:
 # 1,
 
 # process local data
 def load_data():
     return True
 
-# main function
-def main(task, args_debug, args_force):
-    global debug
-    debug = args_debug
-    if debug:
-        logging.info('Debug mode enabled.')
-
-    global force
-    force = args_force
-    if force:
-        logging.info('force update for all data.')
-
-    if task == 'fetch':
-        check_update()
-    elif task == 'load':
-        load_data()
-    else:
-        print('unknown command. see --help.')
-
 
 if __name__ == "__main__":
     # command-line argument handling
-    parser = argparse.ArgumentParser(description='fetch iafd data.')
-    parser.add_argument('--task', type=str, default='fetch', help='fetch from iafd.com or load from local data ... (fetch , load)')
+    keys_str = ",".join(function_map.keys())
+
+    parser = argparse.ArgumentParser(description='fetch javdb data.')
+    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
     parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
     parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
     args = parser.parse_args()
 
-    main(args.task, args.debug, args.force)
+    main(args.cmd, args.debug, args.force)
@@ -30,9 +30,15 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
         try:
             if 'javdb.com' not in url.lower():
                 logging.error(f'wrong url format: {url}')
-                return None
+                return None, None
 
             response = scraper.get(url, headers=headers)
 
+            # handle the HTTP status code
+            if response.status_code == 404:
+                logging.warning(f"Page not found (404): {url}")
+                return None, 404  # return the 404 directly so the caller can skip
+
             response.raise_for_status()  # raise on other HTTP errors
 
             # preprocess the HTML (if a preprocessor was given)
@@ -40,7 +46,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
 
             soup = BeautifulSoup(html_text, parser)
             if validator(soup):  # run the custom page check
-                return soup
+                return soup, response.status_code
 
             logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
         except cloudscraper.exceptions.CloudflareChallengeError as e:
@@ -51,7 +57,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
             logging.error(f"Unexpected error on {url}: {e}, Retrying...")
 
     logging.error(f'Fetching failed after max retries. {url}')
-    return None  # still failing after the maximum number of retries
+    return None, None  # still failing after the maximum number of retries
 
 # fix the HTML structure: strip redundant tags and repair <a> tags; needed when fetching the ethnicity pages
 def preprocess_html(html):
@@ -78,6 +84,21 @@ def url_page_num(href):
     else:
         return None
 
 
+# e.g. <span class="avatar" style="background-image: url(https://c0.jdbstatic.com/avatars/md/mdRn.jpg)"></span>
+def parse_avatar_image(soup):
+    try:
+        span = soup.find("span", class_="avatar")
+        if not span:
+            return ""  # no <span> element found: return an empty string
+
+        style = span.get("style", "")
+        match = re.search(r'url\(["\']?(.*?)["\']?\)', style)
+        return match.group(1) if match else ""  # return the URL on success, else an empty string
+    except Exception as e:
+        return ""  # on any exception, return an empty string
+
 
 # parse the HTML and extract the data we need
 def parse_actors_uncensored(soup, href):
     div_actors = soup.find("div", id='actors')
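A quick check of parse_avatar_image against the sample markup in its own comment; the regex pulls the URL out of the inline background-image style:

    from bs4 import BeautifulSoup
    html = '<span class="avatar" style="background-image: url(https://c0.jdbstatic.com/avatars/md/mdRn.jpg)"></span>'
    soup = BeautifulSoup(html, "html.parser")
    print(parse_avatar_image(soup))  # -> https://c0.jdbstatic.com/avatars/md/mdRn.jpg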
@@ -123,6 +144,29 @@ def parse_actors_uncensored(soup, href):
 
 # parse the HTML and extract the data we need
 def parse_actor_detail(soup, href):
+    # look for aliases first
+    alias_list = []
+
     div_meta = soup.find('span', class_='actor-section-name')
     if not div_meta:
         logging.warning(f'warning: no meta data found in page {href}')
         return None, None
+    alias_div = soup.find('div', class_='column section-title')
+
+    if alias_div:
+        meta_list = alias_div.find_all('span', class_='section-meta')
+        if len(meta_list) > 1:
+            alias_list = meta_list[0].text.strip().split(", ")
+
+    # avatar
+    pic = ''
+    avatar = soup.find("div", class_="column actor-avatar")
+    if avatar:
+        pic = parse_avatar_image(avatar)
+
+    # data to return
+    actor = {}
+
     div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
     if not div_movies:
         logging.warning("Warning: No movies div found")
@@ -157,7 +201,13 @@ def parse_actor_detail(soup, href):
     if next_page_number and next_page_number > current_page_number:
         next_url = host_url + next_page_url
 
-    return list_data, next_url
+    actor = {
+        'pic': pic,
+        'alias': alias_list,
+        'movies': list_data
+    }
+
+    return actor, next_url
 
 
 # parse the HTML and extract the data we need
@@ -257,7 +307,7 @@ def parse_series_detail(soup, href):
     div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
     if not div_movies:
         logging.warning("Warning: No movies div found")
-        return None, None
+        return [], None
 
     # parse the item elements
     rows = div_movies.find_all('div', class_='item')
@@ -337,7 +387,7 @@ def parse_maker_detail(soup, href):
     div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
     if not div_movies:
         logging.warning("Warning: No movies div found")
-        return None, None
+        return [], None
 
     # parse the item elements
     rows = div_movies.find_all('div', class_='item')
@@ -17,21 +17,42 @@ def get_id_by_href(table: str, href: str) -> int:
     row = cursor.fetchone()
     return row[0] if row else None
 
 # insert an actor index entry, sourced from the list pages
-def insert_actor_index(name, href):
+def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None):
     try:
-        cursor.execute("""
-            INSERT OR IGNORE INTO javdb_actors (href, name) VALUES (?, ?)
-        """, (
-            href, name
-        ))
+        # check whether the actor already exists
+        cursor.execute("SELECT id, name, from_actor_list, from_movie_list FROM javdb_actors WHERE href = ?", (href,))
+        existing_actor = cursor.fetchone()
+
+        if existing_actor:  # the actor already exists
+            actor_id, existing_name, existing_actor_list, existing_movie_list = existing_actor
+
+            # keep the stored values for any flag that was not passed in
+            from_actor_list = from_actor_list if from_actor_list is not None else existing_actor_list
+            from_movie_list = from_movie_list if from_movie_list is not None else existing_movie_list
+
+            cursor.execute("""
+                UPDATE javdb_actors
+                SET name = ?,
+                    from_actor_list = ?,
+                    from_movie_list = ?,
+                    updated_at = datetime('now', 'localtime')
+                WHERE href = ?
+            """, (name, from_actor_list, from_movie_list, href))
+        else:  # the actor does not exist yet: insert
+            cursor.execute("""
+                INSERT INTO javdb_actors (href, name, from_actor_list, from_movie_list)
+                VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0))
+            """, (href, name, from_actor_list, from_movie_list))
 
         conn.commit()
 
         performer_id = get_id_by_href('javdb_actors', href)
         if performer_id:
-            logging.debug(f'insert one actor index, id: {performer_id}, name: {name}, href: {href}')
-
-        return performer_id
+            logging.debug(f'Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}')
 
+        return performer_id
     except sqlite3.Error as e:
         conn.rollback()
         logging.error(f"database error: {e}")
@@ -41,28 +62,49 @@ def insert_actor_index(name, href):
         logging.error(f"unknown error: {e}")
         return None
 
 
 # insert a movie index entry, sourced from the list pages
-def insert_movie_index(title, href):
+def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None):
     try:
-        # insert or update the movie info
-        cursor.execute("""
-            INSERT OR IGNORE INTO javdb_movies (title, href) VALUES (?, ?)
-        """,
-            (title, href)
-        )
+        # check the database for the movie first
+        cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series FROM javdb_movies WHERE href = ?", (href,))
+        existing_movie = cursor.fetchone()
+
+        if existing_movie:  # the movie already exists
+            movie_id, existing_actor, existing_maker, existing_series = existing_movie
+
+            # keep the stored values for any flag that was not passed in
+            from_actor_list = from_actor_list if from_actor_list is not None else existing_actor
+            from_movie_makers = from_movie_makers if from_movie_makers is not None else existing_maker
+            from_movie_series = from_movie_series if from_movie_series is not None else existing_series
+
+            cursor.execute("""
+                UPDATE javdb_movies
+                SET title = ?,
+                    from_actor_list = ?,
+                    from_movie_makers = ?,
+                    from_movie_series = ?,
+                    updated_at = datetime('now', 'localtime')
+                WHERE href = ?
+            """, (title, from_actor_list, from_movie_makers, from_movie_series, href))
+        else:  # the movie does not exist yet: insert
+            cursor.execute("""
+                INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series)
+                VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
+            """, (title, href, from_actor_list, from_movie_makers, from_movie_series))
 
         conn.commit()
 
         movie_id = get_id_by_href('javdb_movies', href)
         if movie_id:
-            logging.debug(f'insert one movie index, id: {movie_id}, title: {title}, href: {href}')
-
-        return movie_id
+            logging.debug(f'Inserted/Updated movie index, id: {movie_id}, title: {title}, href: {href}')
 
+        return movie_id
     except Exception as e:
         conn.rollback()
-        logging.error("Error inserting movie: %s", e)
+        logging.error(f"Error inserting/updating movie: {e}")
         return None
 
 
 # insert actor-movie relation rows
 def insert_actor_movie(performer_id, movie_id, tags=''):
     try:
@@ -117,7 +159,7 @@ def insert_or_update_actor(actor):
             movie_id = get_id_by_href('javdb_movies', movie['href'])
             # The movie does not exist yet, insert it first
             if movie_id is None:
-                movie_id = insert_movie_index(movie['title'], movie['href'])
+                movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1)
             if movie_id:
                 tmp_id = insert_actor_movie(actor_id, movie_id)
                 if tmp_id :
@@ -369,7 +411,7 @@ def insert_or_update_movie(movie):
             performer_id = get_id_by_href('javdb_actors', performer['href'])
             # If the performer does not exist yet, insert it first
             if performer_id is None:
-                performer_id = insert_actor_index(performer['name'], performer['href'])
+                performer_id = insert_actor_index(performer['name'], performer['href'], from_movie_list=1)
             if performer_id:
                 tmp_id = insert_actor_movie(performer_id, movie_id)
                 if tmp_id:
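These two call sites stamp provenance onto stub rows: a movie first seen on an actor page gets from_actor_list=1, and an actor first seen on a movie page gets from_movie_list=1, while is_full_data stays 0 until the detail page is scraped. A hedged example of how such flags could drive a re-crawl queue (column names are from the schema later in this diff):

    # Find movie stubs discovered via actor pages that still need a detail fetch
    cursor.execute("""
        SELECT id, href FROM javdb_movies
        WHERE from_actor_list = 1 AND is_full_data = 0
    """)
    pending = cursor.fetchall()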
@@ -465,7 +507,13 @@ def insert_task_log():
             INSERT INTO javdb_task_log (task_status) VALUES ('Start')
         """)
         conn.commit()
-        return cursor.lastrowid  # the task_id of the inserted row
+
+        task_id = cursor.lastrowid
+        if task_id is None:
+            return None
+        update_task_log(task_id=task_id, task_status='Start')
+
+        return task_id  # the task_id of the inserted row
     except sqlite3.Error as e:
         logging.error(f"Failed to insert task: {e}")
         return None
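insert_task_log now guards against a missing lastrowid and routes the status write through update_task_log, whose body this diff does not show. A hypothetical minimal counterpart that would satisfy the call above, assuming the module-level cursor/conn used elsewhere in db_tools and the javdb_task_log schema defined later in this diff:

    # Hypothetical sketch, not the project's actual update_task_log
    def update_task_log(task_id, task_status=None):
        try:
            cursor.execute("""
                UPDATE javdb_task_log
                SET task_status = COALESCE(?, task_status),
                    updated_at = datetime('now', 'localtime')
                WHERE task_id = ?
            """, (task_status, task_id))
            conn.commit()
        except sqlite3.Error as e:
            logging.error(f"Failed to update task log: {e}")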
@@ -0,0 +1,18 @@
+import re
+import os
+import json
+import time
+import csv
+from urllib.parse import urlparse
+import logging
+
+
+# Strip the query parameters from a URL such as https://www.javdb.com/makers/16w?f=download
+def remove_url_query(url: str) -> str:
+    try:
+        parsed_url = urlparse(url)
+        clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
+        return clean_url
+    except Exception as e:
+        print(f"Failed to parse URL: {e}")
+        return url
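Because remove_url_query rebuilds the URL from scheme, host, and path only, it drops fragments as well as the query string. A quick check with the example URL from the comment above:

    print(remove_url_query("https://www.javdb.com/makers/16w?f=download"))
    # -> https://www.javdb.com/makers/16w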
@@ -93,7 +93,7 @@ def load_json(file_path):
         return []

 if __name__ == "__main__":
-    create_table()
+    #create_table()
     json_data = load_json("./result/models_detail.json")
     if json_data:
         insert_data(json_data)
@@ -1,4 +1,3 @@
-CREATE TABLE sqlite_sequence(name,seq);
 CREATE TABLE IF NOT EXISTS "iafd_performers" (
 id INTEGER PRIMARY KEY AUTOINCREMENT,
 name TEXT NOT NULL,
@ -27,8 +26,15 @@ CREATE TABLE IF NOT EXISTS "iafd_performers" (
|
||||
vixen_cnt INTEGER,
|
||||
blacked_cnt INTEGER,
|
||||
tushy_cnt INTEGER,
|
||||
x_art_cnt INTEGER
|
||||
);
|
||||
x_art_cnt INTEGER,
|
||||
is_full_data INTEGER DEFAULT (0) NOT NULL,
|
||||
birth_year INTEGER DEFAULT (0) NOT NULL,
|
||||
from_astro_list INTEGER DEFAULT (0) NOT NULL,
|
||||
from_birth_list INTEGER DEFAULT (0) NOT NULL,
|
||||
from_ethnic_list INTEGER DEFAULT (0) NOT NULL,
|
||||
from_movie_list INTEGER DEFAULT (0) NOT NULL
|
||||
);
|
||||
CREATE TABLE sqlite_sequence(name,seq);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_performer_aliases" (
|
||||
`performer_id` integer NOT NULL,
|
||||
`alias` varchar(255) NOT NULL,
|
||||
@ -44,59 +50,6 @@ CREATE TABLE IF NOT EXISTS "iafd_movies_appers_in" (
|
||||
foreign key(`appears_in_id`) references "iafd_movies"(`id`) on delete CASCADE,
|
||||
PRIMARY KEY (`movie_id`, `appears_in_id`)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_performer_urls" (
|
||||
`performer_id` integer NOT NULL,
|
||||
`position` varchar(255) NOT NULL,
|
||||
`url` varchar(255) NOT NULL,
|
||||
foreign key(`performer_id`) references "iafd_performers"(`id`) on delete CASCADE,
|
||||
PRIMARY KEY(`performer_id`, `position`, `url`)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_distributors" (
|
||||
`id` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
|
||||
`name` VARCHAR(255) NOT NULL,
|
||||
`href` VARCHAR(255) UNIQUE,
|
||||
`parent_id` INTEGER DEFAULT NULL CHECK (`id` IS NOT `parent_id`) REFERENCES "iafd_distributors"(`id`) ON DELETE SET NULL,
|
||||
`created_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
`updated_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
`details` TEXT
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_studios" (
|
||||
`id` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
|
||||
`name` VARCHAR(255) NOT NULL,
|
||||
`href` VARCHAR(255) UNIQUE,
|
||||
`parent_id` INTEGER DEFAULT NULL CHECK (`id` IS NOT `parent_id`) REFERENCES "iafd_studios"(`id`) ON DELETE SET NULL,
|
||||
`created_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
`updated_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
`details` TEXT
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_performers_movies" (
|
||||
`performer_id` integer,
|
||||
`movie_id` integer,
|
||||
`role` varchar(255),
|
||||
`notes` varchar(255),
|
||||
`created_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
foreign key(`performer_id`) references "iafd_performers"(`id`) on delete CASCADE,
|
||||
foreign key(`movie_id`) references "iafd_movies"(`id`) on delete CASCADE,
|
||||
PRIMARY KEY (`movie_id`, `performer_id`)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_task_log" (
|
||||
`task_id` integer not null primary key autoincrement,
|
||||
`before_performers` integer,
|
||||
`new_performers` integer,
|
||||
`after_performers` integer,
|
||||
`before_movies` integer,
|
||||
`new_movies` integer,
|
||||
`after_movies` integer,
|
||||
`before_distributors` integer,
|
||||
`new_distributors` integer,
|
||||
`after_distributors` integer,
|
||||
`before_studios` integer,
|
||||
`new_studios` integer,
|
||||
`after_studios` integer,
|
||||
`task_status` varchar(255),
|
||||
`created_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
`updated_at` TEXT DEFAULT (datetime('now', 'localtime'))
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_movies" (
|
||||
`id` integer not null primary key autoincrement,
|
||||
`title` varchar(255),
|
||||
@ -112,10 +65,68 @@ CREATE TABLE IF NOT EXISTS "iafd_movies" (
|
||||
`director_id` integer,
|
||||
`href` varchar(255) UNIQUE,
|
||||
`created_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
`updated_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
`updated_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
is_full_data INTEGER DEFAULT (0) NOT NULL,
|
||||
release_year INTEGER DEFAULT (0) NOT NULL,
|
||||
from_performer_list INTEGER DEFAULT (0) NOT NULL,
|
||||
from_dist_list INTEGER DEFAULT (0) NOT NULL,
|
||||
from_stu_list INTEGER DEFAULT (0) NOT NULL,
|
||||
foreign key(`studio_id`) references "iafd_studios"(`id`) on delete SET NULL,
|
||||
foreign key(`distributor_id`) references "iafd_distributors"(`id`) on delete SET NULL
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_performers_movies" (
|
||||
`performer_id` integer,
|
||||
`movie_id` integer,
|
||||
`role` varchar(255),
|
||||
`notes` varchar(255),
|
||||
`created_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
foreign key(`performer_id`) references "iafd_performers"(`id`) on delete CASCADE,
|
||||
foreign key(`movie_id`) references "iafd_movies"(`id`) on delete CASCADE,
|
||||
PRIMARY KEY (`movie_id`, `performer_id`)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_studios" (
|
||||
`id` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
|
||||
`name` VARCHAR(255) NOT NULL,
|
||||
`href` VARCHAR(255) UNIQUE,
|
||||
`parent_id` INTEGER DEFAULT NULL CHECK (`id` IS NOT `parent_id`) REFERENCES "iafd_studios"(`id`) ON DELETE SET NULL,
|
||||
`created_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
`updated_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
`details` TEXT
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_distributors" (
|
||||
`id` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
|
||||
`name` VARCHAR(255) NOT NULL,
|
||||
`href` VARCHAR(255) UNIQUE,
|
||||
`parent_id` INTEGER DEFAULT NULL CHECK (`id` IS NOT `parent_id`) REFERENCES "iafd_distributors"(`id`) ON DELETE SET NULL,
|
||||
`created_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
`updated_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
`details` TEXT
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_performer_urls" (
|
||||
`performer_id` integer NOT NULL,
|
||||
`position` varchar(255) NOT NULL,
|
||||
`url` varchar(255) NOT NULL,
|
||||
foreign key(`performer_id`) references "iafd_performers"(`id`) on delete CASCADE,
|
||||
PRIMARY KEY(`performer_id`, `position`, `url`)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_task_log" (
|
||||
`task_id` integer not null primary key autoincrement,
|
||||
`full_data_performers` integer,
|
||||
`total_performers` integer,
|
||||
`full_data_movies` integer,
|
||||
`total_movies` integer,
|
||||
`total_distributors` integer,
|
||||
`total_studios` integer,
|
||||
`task_status` varchar(255),
|
||||
`created_at` TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
`updated_at` TEXT DEFAULT (datetime('now', 'localtime'))
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS "iafd_meta_ethnic" (
|
||||
`id` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
|
||||
`name` VARCHAR(255) NOT NULL,
|
||||
`href` VARCHAR(255) UNIQUE,
|
||||
`created_at` TEXT DEFAULT (datetime('now', 'localtime'))
|
||||
);
|
||||
CREATE TABLE javhd_models (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
rank INTEGER,
|
||||
@@ -170,3 +181,78 @@ CREATE TABLE thelordofporn_alias (
 FOREIGN KEY (actress_id) REFERENCES thelordofporn_actress(id) ON DELETE CASCADE,
 PRIMARY KEY(`actress_id`, `alias`)
 );
+CREATE TABLE javdb_actors (
+id INTEGER PRIMARY KEY AUTOINCREMENT,
+name TEXT NOT NULL,
+href TEXT UNIQUE NOT NULL,
+pic TEXT,
+created_at DATETIME DEFAULT (datetime('now', 'localtime')),
+updated_at DATETIME DEFAULT (datetime('now', 'localtime')),
+is_full_data INTEGER DEFAULT (0) NOT NULL,
+from_actor_list INTEGER DEFAULT (0) NOT NULL,
+from_movie_list INTEGER DEFAULT (0) NOT NULL
+);
+CREATE TABLE javdb_actors_alias (
+actor_id INTEGER NOT NULL,
+alias TEXT NOT NULL,
+created_at DATETIME DEFAULT (datetime('now', 'localtime')),
+updated_at DATETIME DEFAULT (datetime('now', 'localtime')),
+PRIMARY KEY (actor_id, alias),
+FOREIGN KEY (actor_id) REFERENCES javdb_actors(id) ON DELETE CASCADE
+);
+CREATE TABLE IF NOT EXISTS "javdb_makers" (
+`id` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
+`name` VARCHAR(255) NOT NULL,
+`href` VARCHAR(255) UNIQUE,
+`parent_id` INTEGER DEFAULT NULL CHECK (`id` IS NOT `parent_id`) REFERENCES "javdb_makers"(`id`) ON DELETE SET NULL,
+`created_at` TEXT DEFAULT (datetime('now', 'localtime')),
+`updated_at` TEXT DEFAULT (datetime('now', 'localtime')),
+`details` TEXT
+);
+CREATE TABLE IF NOT EXISTS "javdb_series" (
+`id` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
+`name` VARCHAR(255) NOT NULL,
+`href` VARCHAR(255) UNIQUE,
+`parent_id` INTEGER DEFAULT NULL CHECK (`id` IS NOT `parent_id`) REFERENCES "javdb_series"(`id`) ON DELETE SET NULL,
+`created_at` TEXT DEFAULT (datetime('now', 'localtime')),
+`updated_at` TEXT DEFAULT (datetime('now', 'localtime')),
+`details` TEXT
+);
+CREATE TABLE IF NOT EXISTS "javdb_movies" (
+id INTEGER PRIMARY KEY AUTOINCREMENT,
+href TEXT UNIQUE,
+title TEXT,
+cover_url TEXT,
+serial_number TEXT,
+release_date TEXT,
+duration TEXT,
+maker_id TEXT,
+series_id TEXT,
+is_full_data INTEGER DEFAULT (0) NOT NULL,
+created_at TEXT DEFAULT (datetime('now', 'localtime')),
+updated_at TEXT DEFAULT (datetime('now', 'localtime')),
+from_actor_list INTEGER DEFAULT (0) NOT NULL,
+from_movie_makers INTEGER DEFAULT (0) NOT NULL,
+from_movie_series INTEGER DEFAULT (0) NOT NULL
+);
+CREATE TABLE IF NOT EXISTS "javdb_actors_movies" (
+actor_id INTEGER,
+movie_id INTEGER,
+tags TEXT,
+created_at TEXT DEFAULT (datetime('now', 'localtime')),
+PRIMARY KEY (actor_id, movie_id),
+FOREIGN KEY (actor_id) REFERENCES javdb_actors(id),
+FOREIGN KEY (movie_id) REFERENCES "javdb_movies"(id)
+);
+CREATE TABLE IF NOT EXISTS "javdb_task_log" (
+`task_id` integer not null primary key autoincrement,
+`full_data_actors` integer,
+`total_actors` integer,
+`full_data_movies` integer,
+`total_movies` integer,
+`total_makers` integer,
+`total_series` integer,
+`task_status` varchar(255),
+`created_at` TEXT DEFAULT (datetime('now', 'localtime')),
+`updated_at` TEXT DEFAULT (datetime('now', 'localtime'))
+);
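The reworked task-log tables store per-run totals (total_*, full_data_*) rather than the old before/new/after triples, so a run's progress can be snapshotted with plain COUNT queries. A sketch of how those columns might be filled (the helper name snapshot_counts is hypothetical; the column and table names come from the schema above):

    def snapshot_counts(cursor):
        queries = {
            'total_actors': "SELECT COUNT(*) FROM javdb_actors",
            'full_data_actors': "SELECT COUNT(*) FROM javdb_actors WHERE is_full_data = 1",
            'total_movies': "SELECT COUNT(*) FROM javdb_movies",
            'full_data_movies': "SELECT COUNT(*) FROM javdb_movies WHERE is_full_data = 1",
            'total_makers': "SELECT COUNT(*) FROM javdb_makers",
            'total_series': "SELECT COUNT(*) FROM javdb_series",
        }
        counts = {}
        for column, sql in queries.items():
            cursor.execute(sql)
            counts[column] = cursor.fetchone()[0]
        return counts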
@@ -147,7 +147,7 @@ def insert_actress(conn, actress):
 def main():
     setup_logging()
     conn = connect_db()
-    create_tables(conn)
+    #create_tables(conn)
     actresses = load_json("./result/actress_detail.json")

     if actresses: