modify some scripts.
@@ -16,9 +16,7 @@ debug = False
 force = False
 
 # Fetch the performer list by astrological sign (no pagination)
-def fetch_performers_by_astro(existed_performer_hrefs):
-    performers = []
+def fetch_performers_by_astro():
 
     for astro in scraper.astro_list:
         url = scraper.astr_base_url + astro
         logging.info(f"Fetching data for {astro}, url {url} ...")
@@ -28,11 +26,13 @@ def fetch_performers_by_astro(existed_performer_hrefs):
             list_data, next_url = scraper.parse_page_astro(soup, astro)
             if list_data:
                 for row in list_data:
-                    if row['href'] not in existed_performer_hrefs:
-                        performers.append({
-                            'person': row['person'],
-                            'href': row['href'].lower() if row['href'] else ''
-                        })
+                    # Write into the performers table
+                    performer_id = db_tools.insert_performer_index(name=row['person'], href=row['href'].lower() if row['href'] else '')
+                    if performer_id:
+                        logging.debug(f"insert performer index to db. performer_id:{performer_id}, name: {row['person']}, href:{row['href']}")
+                    else:
+                        logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
             else:
                 logging.warning(f'fetch astro error. {url} ...')
         else:
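Note: with this change the list scrapers stop returning an accumulated `performers` list and write each row straight to the database; deduplication moves from the in-memory `existed_performer_hrefs` check to the `INSERT OR IGNORE` inside `db_tools.insert_performer_index` (its body appears later in this diff). A minimal sketch of that idempotent-insert pattern, with an in-memory table and an assumed UNIQUE constraint on `href` (the real schema is not shown in this diff):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    # Assumed schema: INSERT OR IGNORE only skips duplicates
    # when href carries a UNIQUE constraint.
    conn.execute("CREATE TABLE iafd_performers (id INTEGER PRIMARY KEY, href TEXT UNIQUE, name TEXT)")

    def insert_performer_index(name, href):
        conn.execute("INSERT OR IGNORE INTO iafd_performers (href, name) VALUES (?, ?)", (href, name))
        conn.commit()
        row = conn.execute("SELECT id FROM iafd_performers WHERE href = ?", (href,)).fetchone()
        return row[0] if row else None

    # Re-running is a no-op: both calls return the same id.
    assert insert_performer_index('Alice', '/person/alice.htm') == insert_performer_index('Alice', '/person/alice.htm')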
@@ -41,13 +41,10 @@ def fetch_performers_by_astro(existed_performer_hrefs):
         # break early when debugging
         if debug:
             break
-    return performers
 
 
 # Fetch the performer list by birthday (no pagination)
-def fetch_performers_by_birth(existed_performer_hrefs):
-    performers = []
+def fetch_performers_by_birth():
 
     for month in range(1, 13):  # months 1 through 12
         for day in range(1, 32):  # days 1 through 31
             url = scraper.birth_base_url.format(month=month, day=day)
@@ -57,11 +54,12 @@ def fetch_performers_by_birth(existed_performer_hrefs):
                 list_data, next_url = scraper.parse_page_birth(soup, month, day)
                 if list_data:
                     for row in list_data:
-                        if row['href'] not in existed_performer_hrefs:
-                            performers.append({
-                                'person': row['person'],
-                                'href': row['href'].lower() if row['href'] else ''
-                            })
+                        # Write into the performers table
+                        performer_id = db_tools.insert_performer_index(name=row['person'], href=row['href'].lower() if row['href'] else '')
+                        if performer_id:
+                            logging.debug(f"insert performer index to db. performer_id:{performer_id}, name: {row['person']}, href:{row['href']}")
+                        else:
+                            logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
                 else:
                     logging.warning(f'fetch birth error. {url} ...')
             else:
@@ -69,18 +67,14 @@ def fetch_performers_by_birth(existed_performer_hrefs):
 
             # break early when debugging
             if debug:
-                return performers
+                return True
 
-    return performers
 
 # Normalize ethnicity names that contain spaces
 def format_ethnic(ethnic):
     return ethnic.replace(' ', '+')
 
 # Fetch the performer list by ethnicity (paginated)
-def fetch_performers_by_ethnic(existed_performer_hrefs):
-    performers = []
+def fetch_performers_by_ethnic():
 
     for ethnic in scraper.ethnic_list:
         url = scraper.ethnic_url + format_ethnic(ethnic)
         next_url = url
@@ -93,11 +87,12 @@ def fetch_performers_by_ethnic(existed_performer_hrefs):
                 list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
                 if list_data:
                     for row in list_data:
-                        if row['href'] not in existed_performer_hrefs:
-                            performers.append({
-                                'person': row['person'],
-                                'href': row['href'].lower() if row['href'] else ''
-                            })
+                        # Write into the performers table
+                        performer_id = db_tools.insert_performer_index(name=row['person'], href=row['href'].lower() if row['href'] else '')
+                        if performer_id:
+                            logging.debug(f"insert performer index to db. performer_id:{performer_id}, name: {row['person']}, href:{row['href']}")
+                        else:
+                            logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
                 else:
                     logging.warning(f'fetch ethnic error. {url} ...')
             else:
@@ -105,15 +100,11 @@ def fetch_performers_by_ethnic(existed_performer_hrefs):
 
             # break early when debugging
             if debug:
-                return performers
+                return True
-    return performers
 
 
 # Fetch the distributors list
-def fetch_distributors_list(existed_distributors_href):
+def fetch_distributors_list():
     url = scraper.distributors_list_url
-    distributors_list = []
 
     logging.info(f"Fetching data for distributors list, url {url} ...")
     soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
     if soup:
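Note: `db_tools.insert_or_update_distributor` is called in the next hunk but its body is not part of this diff; judging from the other helpers in this commit, it presumably follows the same SQLite upsert shape. A hedged sketch (table schema and conflict target assumed for illustration):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE iafd_distributors (id INTEGER PRIMARY KEY, href TEXT UNIQUE, name TEXT)")

    def insert_or_update_distributor(row):
        # Upsert keyed on href, mirroring the ON CONFLICT(href) DO UPDATE
        # clauses used for performers and movies elsewhere in this commit.
        conn.execute(
            "INSERT INTO iafd_distributors (href, name) VALUES (?, ?) "
            "ON CONFLICT(href) DO UPDATE SET name = excluded.name",
            (row['href'], row['name']),
        )
        conn.commit()
        r = conn.execute("SELECT id FROM iafd_distributors WHERE href = ?", (row['href'],)).fetchone()
        return r[0] if r else None

    print(insert_or_update_distributor({'name': 'vixen.com', 'href': '/distrib/9999'}))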
@@ -121,23 +112,17 @@ def fetch_distributors_list(existed_distributors_href):
         if list_data:
             for row in list_data:
                 dis_url = scraper.distributors_base_url + row['href']
-                if dis_url in existed_distributors_href:
-                    continue
-                distributors_list.append({
-                    'name': row['name'],
-                    'href': dis_url.lower() if dis_url else ''
-                })
+                dist_id = db_tools.insert_or_update_distributor({'name': row['name'], 'href': dis_url})
+                if dist_id:
+                    logging.debug(f"insert one record into distributors table. id:{dist_id}, name: {row['name']}, href:{dis_url}")
         else:
             logging.warning(f'fetch distributors error. {url} ...')
     else:
         logging.warning(f'fetch distributors error. {url} ...')
-    return distributors_list
 
 # Fetch the studios list
-def fetch_studios_list(existed_studios_href):
+def fetch_studios_list():
     url = scraper.studios_list_url
-    studios_list = []
 
     logging.info(f"Fetching data for studios list, url {url} ...")
     soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
     if soup:
@@ -145,205 +130,189 @@ def fetch_studios_list(existed_studios_href):
         if list_data:
             for row in list_data:
                 stu_url = scraper.studios_base_url + row['href']
-                if stu_url in existed_studios_href:
-                    continue
-                studios_list.append({
-                    'name': row['name'],
-                    'href': stu_url.lower() if stu_url else ''
-                })
+                stu_id = db_tools.insert_or_update_studio({'name': row['name'], 'href': stu_url})
+                if stu_id:
+                    logging.debug(f"insert one record into studios table. id:{stu_id}, name: {row['name']}, href:{stu_url}")
         else:
             logging.warning(f'fetch studios error. {url} ...')
     else:
         logging.warning(f'fetch studios error. {url} ...')
-    return studios_list
 
+
+# Refresh movie info for the distributors list
+def fetch_movies_by_dist():
+    url_list = db_tools.query_distributor_hrefs()
+    if debug:
+        url_list = db_tools.query_distributor_hrefs(name='vixen.com')
+    for url in url_list:
+        logging.info(f"Fetching data for distributor url {url} ...")
+        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
+        if soup:
+            list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
+            if list_data:
+                for movie in list_data:
+                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']))
+                    if tmp_id:
+                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
+                    else:
+                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
+            else:
+                logging.warning(f'parse_page_movie error. url: {url}')
+        # break early when debugging
+        if debug:
+            break
+
+# Refresh movie info for the studios list
+def fetch_movies_by_stu():
+    url_list = db_tools.query_studio_hrefs()
+    if debug:
+        url_list = db_tools.query_studio_hrefs(name='vixen.com')
+    for url in url_list:
+        logging.info(f"Fetching data for studio url {url} ...")
+        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
+        if soup:
+            list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
+            if list_data:
+                for movie in list_data:
+                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']))
+                    if tmp_id:
+                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
+                    else:
+                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
+            else:
+                logging.warning(f'parse_page_movie error. url: {url}')
+        # break early when debugging
+        if debug:
+            break
+
+# Refresh performer details
+def fetch_performers_detail():
+    performers_list = []
+    while True:
+        # Pull a batch at a time from the database instead of loading everything at once
+        performers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=1000)
+        if len(performers_list) < 1:
+            logging.info(f'all performers fetched.')
+            break
+        for performer in performers_list:
+            url = performer['href']
+            person = performer['name']
+            logging.info(f"Fetching data for performer ({person}), url {url} ...")
+            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
+            if soup:
+                data = scraper.parse_page_performer(soup)
+                if data:
+                    performer_id = db_tools.insert_or_update_performer({
+                        'href': url,
+                        'person': person,
+                        **data
+                    })
+                    if performer_id:
+                        logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
+                    else:
+                        logging.warning(f'insert person: ({person}) {url} failed.')
+
+                    # Write to a local JSON file
+                    utils.write_person_json(person, url, {
+                        'href': url,
+                        'person': person,
+                        **data
+                    })
+                else:
+                    logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
+            else:
+                logging.warning(f'fetch_page error. person: ({person}), url: {url}')
+            # break early when debugging
+            if debug:
+                return True
+
+# Refresh movie details
+def fetch_movies_detail():
+    movies_list = []
+    while True:
+        movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=1000)
+        if len(movies_list) < 1:
+            logging.info(f'all movies fetched.')
+            break
+        for movie in movies_list:
+            url = movie['href']
+            title = movie['title']
+            logging.info(f"Fetching data for movie ({title}), url {url} ...")
+            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
+            if soup:
+                movie_data = scraper.parse_page_movie(soup, url, title)
+                if movie_data:
+                    # Normalize malformed URLs
+                    if movie_data['DistributorHref']:
+                        movie_data['DistributorHref'] = utils.dist_stu_href_rewrite(movie_data['DistributorHref'].lower())
+                    if movie_data['StudioHref']:
+                        movie_data['StudioHref'] = utils.dist_stu_href_rewrite(movie_data['StudioHref'].lower())
+                    movie_id = db_tools.insert_or_update_movie(movie_data)
+                    if movie_id:
+                        logging.info(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
+                    else:
+                        logging.warning(f'insert movie {url} failed.')
+
+                    # Write to a local JSON file
+                    utils.write_movie_json(url, movie_data)
+                else:
+                    logging.warning(f'parse_page_movie error. url: {url}')
+            else:
+                logging.warning(f'fetch_page error. url: {url}')
+            # break early when debugging
+            if debug:
+                return True
 
 # Check for updates
 def check_update():
-    # Load the performer list from the database
-    existed_performer_hrefs = db_tools.query_performer_hrefs()
-    if not existed_performer_hrefs:
-        logging.warning(f'get existed performers from db error.')
-        return None
 
     # Start a task
     task_id = db_tools.insert_task_log()
     if task_id is None:
         logging.warning(f'insert task log error.')
         return None
 
-    # Collect new performers from the list pages
-    new_performers = []
-    if not debug:  # large data volume, skip in debug mode
-        new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
-        new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
-        new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))
+    # Refresh the astro performer list
+    db_tools.update_task_log(task_id, task_status='fetching astro list')
+    fetch_performers_by_astro()
 
-    # Fetch each performer's details and write them to the DB
-    new_performers = list({item["href"]: item for item in new_performers}.values())
-    logging.info(f'get new performers count: {len(new_performers)} ')
-    db_tools.update_task_log(task_id, before_performers=len(existed_performer_hrefs), new_performers=len(new_performers), task_status='Inserting new performers')
-    for performer in new_performers:
-        url = performer['href']
-        person = performer['person']
-        logging.info(f"Fetching data for performer {person}, url {url} ...")
-        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
-        if soup:
-            data, credits = scraper.parse_page_performer(soup)
-            if data:
-                performer_id = db_tools.insert_or_update_performer({
-                    'href': url,
-                    'person': person,
-                    **data
-                })
-                if performer_id:
-                    logging.info(f'insert one person, id: {performer_id}, person: {person}, url: {url}')
-                else:
-                    logging.warning(f'insert person: {person} {url} failed.')
+    # Refresh the birthday performer list
+    db_tools.update_task_log(task_id, task_status='fetching birth list')
+    fetch_performers_by_birth()
 
-                # Write to a local JSON file
-                utils.write_person_json(person, url, {
-                    'href': url,
-                    'person': person,
-                    **data,
-                    'credits': credits if credits else {}
-                })
-            else:
-                logging.warning(f'parse_page_performer error. person: {person}, url: {url}')
-        else:
-            logging.warning(f'fetch_page error. person: {person}, url: {url}')
-        # break early when debugging
-        if debug:
-            break
+    # Refresh the ethnicity performer list
+    db_tools.update_task_log(task_id, task_status='fetching ethnic list')
+    fetch_performers_by_ethnic()
 
-    # Load the distributors list from the database
-    existed_distributors_href = db_tools.query_distributor_hrefs()
-    if existed_distributors_href is None:
-        logging.warning(f'get existed distributors from db error.')
-        return
-    new_distributors = fetch_distributors_list(existed_distributors_href)
-    db_tools.update_task_log(task_id, before_distributors=len(existed_distributors_href), new_distributors=len(new_distributors), task_status='Inserting new distributors')
-    for dist in new_distributors:
-        dist_id = db_tools.insert_or_update_distributor(dist)
-        if dist_id:
-            logging.info(f"insert one distributor record, id: {dist_id}, name: {dist['name']}, href: {dist['href']}")
-        else:
-            logging.warning(f"insert into distributors failed. name: {dist['name']} href: {dist['href']}")
+    # Refresh the distributors list
+    db_tools.update_task_log(task_id, task_status='fetching distributor list')
+    fetch_distributors_list()
 
-    # Load the studios list from the database
-    existed_studios_href = db_tools.query_studio_hrefs()
-    if existed_studios_href is None:
-        logging.warning(f'get existed studios from db error.')
-        return
-    new_studios = fetch_studios_list(existed_studios_href)
-    db_tools.update_task_log(task_id, before_studios=len(existed_studios_href), new_studios=len(new_studios), task_status='Inserting new studios')
-    for stu in new_studios:
-        stu_id = db_tools.insert_or_update_studio(stu)
-        if stu_id:
-            logging.info(f"insert one studio record, id: {stu_id}, name: {stu['name']}, href: {stu['href']}")
-        else:
-            logging.warning(f"insert into studio failed. name: {stu['name']}, href: {stu['href']}")
-
-    # Load the movie list from the database
-    existed_movies = db_tools.query_movie_hrefs()
-    if existed_movies is None:
-        logging.warning(f'load movies from db error')
-        return
-    new_movies = []
-    new_movie_hrefs = []
-
-    # Walk all distributors and collect their movie lists
-    existed_distributors_href = db_tools.query_distributor_hrefs(name='vixen')
-    if existed_distributors_href is None:
-        logging.warning(f'get existed distributors from db error.')
-        return
-    for url in existed_distributors_href:
-        logging.info(f"Fetching data for distributor url {url} ...")
-        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
-            if list_data:
-                for movie in list_data:
-                    if movie['href'] in existed_movies:
-                        continue
-                    new_movies.append({
-                        'title': movie['title'],
-                        'href': movie['href']
-                    })
-                    new_movie_hrefs.append(movie['href'])
-            else:
-                logging.warning(f'parse_page_movie error. url: {url}')
-        # break early when debugging
-        if debug:
-            break
-    logging.info(f'all new movies found for distributors, now total new {len(new_movies)}')
+    # Refresh the studios list
+    db_tools.update_task_log(task_id, task_status='fetching studio list')
+    fetch_studios_list()
 
-    # Walk all studios and collect their movie lists
-    existed_studios_href = db_tools.query_studio_hrefs(name='vixen')
-    if existed_studios_href is None:
-        logging.warning(f'get existed studios from db error.')
-        return
-    for url in existed_studios_href:
-        logging.info(f"Fetching data for studio url {url} ...")
-        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
-            if list_data:
-                for movie in list_data:
-                    if movie['href'] in existed_movies and movie['href'] in new_movie_hrefs:
-                        continue
-                    new_movies.append({
-                        'title': movie['title'],
-                        'href': movie['href']
-                    })
-                    new_movie_hrefs.append(movie['href'])
-            else:
-                logging.warning(f'parse_page_movie error. url: {url}')
-        # break early when debugging
-        if debug:
-            break
-    logging.info(f'all new movies found for studios, now total new {len(new_movies)}')
-
-    # Fetch details for each new movie
-    new_movies = list({item["href"]: item for item in new_movies}.values())
-    logging.info(f'get merged new movies, count: {len(new_movies)} ')
-    db_tools.update_task_log(task_id, before_movies=len(existed_movies), new_movies=len(new_movies), task_status='Inserting new movies')
-    for movie in new_movies:
-        url = movie['href']
-        title = movie['title']
-        logging.info(f"Fetching data for movie {title}, url {url} ...")
-        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
-        if soup:
-            movie_data = scraper.parse_page_movie(soup, url, title)
-            if movie_data:
-                # Normalize malformed URLs
-                if movie_data['DistributorHref']:
-                    movie_data['DistributorHref'] = utils.dist_stu_href_rewrite(movie_data['DistributorHref'].lower())
-                if movie_data['StudioHref']:
-                    movie_data['StudioHref'] = utils.dist_stu_href_rewrite(movie_data['StudioHref'].lower())
-                movie_id = db_tools.insert_or_update_movie(movie_data)
-                if movie_id:
-                    logging.info(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
-                else:
-                    logging.warning(f'insert movie {url} failed.')
+    # Refresh the movie list
+    db_tools.update_task_log(task_id, task_status='fetching movie list by dist')
+    fetch_movies_by_dist()
+    db_tools.update_task_log(task_id, task_status='fetching movie list by stu')
+    fetch_movies_by_stu()
 
-                # Write to a local JSON file
-                utils.write_movie_json(url, movie_data)
-            else:
-                logging.warning(f'parse_page_movie error. url: {url}')
-        else:
-            logging.warning(f'fetch_page error. url: {url}')
-        # break early when debugging
-        if debug:
-            break
+    # Refresh performer details
+    db_tools.update_task_log(task_id, task_status='fetching performers')
+    fetch_performers_detail()
 
-    # TODO:
-    # 1. appearsIn: movie insertion order is uncontrolled, so some rows cannot be written to movies_appers_in; pending movies should be recorded first and processed after all inserts finish
-    # 2. Updating movies touches several performer statistics fields; find every performer inserted into performers_movies since this task log started and refresh their stats, or simply do a brute-force full refresh
-    # 3. performers_movies is currently updated mainly from movie-page data; performer-page data could serve as a cross-check, especially the notes field on performer pages
+    # Refresh movie details
+    db_tools.update_task_log(task_id, task_status='fetching movies')
+    fetch_movies_detail()
 
     logging.info(f'all process completed!')
     db_tools.finalize_task_log(task_id)
 
+# TODO:
+# 1. After movies are updated, flag the related performers with is_full_data = 0 and re-fetch them
+# 2. Cross-check the movie lists between distributors and studios
+# 3. For messy data, manually import all performers and movies first, then use this program to fetch only new items incrementally
 
 # Process local data
 def load_data():
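Note: fetch_performers_detail and fetch_movies_detail drain a work queue defined by `is_full_data = 0`: each pass pulls a batch of at most 1000 rows, and insert_or_update_performer / insert_or_update_movie (later in this diff) set `is_full_data = 1` on success. The loop skeleton, as a generic sketch; a row whose fetch or insert permanently fails keeps `is_full_data = 0` and will be re-selected forever, so a retry cap or failure flag may eventually be needed:

    def drain(query_batch, process_one, batch_size=1000):
        # Generic shape of the new detail-fetch loops: keep pulling rows that
        # are still marked is_full_data = 0; a successful process_one() must
        # flip that flag, or this loop never terminates.
        while True:
            batch = query_batch(is_full_data=0, limit=batch_size)
            if not batch:
                break
            for row in batch:
                process_one(row)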
@@ -40,6 +40,10 @@ scraper = cloudscraper.create_scraper()
 def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
     for attempt in range(max_retries):
         try:
+            if host_url not in url.lower():
+                logging.error(f'wrong url format: {url}')
+                return None
+
             response = scraper.get(url, headers=headers)
             response.raise_for_status()  # raise on HTTP errors
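Note: the host check is the only functional change here; the `validator` argument, passed throughout as `partial(scraper.generic_validator, tag=..., identifier=..., attr_type=...)`, is unchanged. For reference, a sketch of how functools.partial pre-binds those keyword arguments (the validator body below is a presumed shape, not part of this diff):

    from functools import partial
    from bs4 import BeautifulSoup

    def generic_validator(soup, tag, identifier, attr_type):
        # Presumed shape: the page counts as valid when the expected element exists.
        return soup.find(tag, {attr_type: identifier}) is not None

    validator = partial(generic_validator, tag="div", identifier="headshot", attr_type="id")
    soup = BeautifulSoup('<div id="headshot"></div>', 'html.parser')
    print(validator(soup))  # True; fetch_page can call validator(soup) with one argument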
@@ -267,6 +271,8 @@ def parse_credits_table(table, distributor_list):
             cols = row.find_all('td')
             if len(cols) >= 6:
                 title = cols[0].text.strip()
+                href_a = cols[0].find('a')
+                href = href_a['href'] if href_a else ''
                 year = cols[1].text.strip()
                 distributor = cols[2].text.strip().lower()
                 notes = cols[3].text.strip()
@@ -279,6 +285,7 @@ def parse_credits_table(table, distributor_list):
 
                 movies.append({
                     'title': title,
+                    'href': href,
                     'year': year,
                     'distributor': distributor,
                     'notes': notes,
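Note: these two hunks make parse_credits_table carry each credit's link as well as its title, which is what lets insert_or_update_performer (later in this diff) resolve or create the movie row by href. The `if href_a else ''` guard matters because some title cells have no anchor; a small self-contained illustration:

    from bs4 import BeautifulSoup

    cell = BeautifulSoup('<td><a href="/title/tt1.htm">Some Movie</a></td>', 'html.parser').td
    href_a = cell.find('a')
    href = href_a['href'] if href_a else ''   # same guard as the new lines above
    print(href)  # /title/tt1.htm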
@@ -364,8 +371,9 @@ def parse_page_performer(soup):
     data['blacked_cnt'] = distributor_count['blacked']
     data['tushy_cnt'] = distributor_count['tushy']
     data['x_art_cnt'] = distributor_count['x-art']
+    data['credits'] = credits_list
 
-    return data, credits_list
+    return data
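Note: this changes the function's public shape from a `(data, credits_list)` tuple to a single dict with the credits embedded, so every caller has to migrate; the old tuple-unpacking call site in check_update is deleted in this same commit. A sketch of the migration (stub stands in for the real parser):

    def parse_page_performer_stub(soup):
        # Stand-in for the real parser: credits now travel inside data.
        return {'person': 'Alice', 'credits': {'personal': []}}

    data = parse_page_performer_stub(None)
    credits = data.get('credits') or {}   # replaces: data, credits = parse_page_performer(soup)
    print(sorted(credits))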
@@ -3,6 +3,7 @@ import json
 import config
 import utils
 import logging
+import sys
 from datetime import datetime
 
 # Connect to the SQLite database
@@ -14,14 +15,112 @@ cursor = conn.cursor()
 def get_current_time():
     return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
+
+# Look up an id by href in the given table
+def get_id_by_href(table: str, href: str) -> int:
+    if href is None:
+        return None
+    cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
+    row = cursor.fetchone()
+    return row[0] if row else None
+
+# Insert a performer index row, built from list-page data
+def insert_performer_index(name, href):
+    try:
+        cursor.execute("""
+            INSERT OR IGNORE INTO iafd_performers (href, name) VALUES (?, ?)
+        """, (
+            href, name
+        ))
+        conn.commit()
+
+        performer_id = get_id_by_href('iafd_performers', href)
+        if performer_id:
+            logging.debug(f'insert one performer index, id: {performer_id}, name: {name}, href: {href}')
+
+        return performer_id
+    except sqlite3.Error as e:
+        conn.rollback()
+        logging.error(f"database error: {e}")
+        return None
+    except Exception as e:
+        conn.rollback()
+        logging.error(f"unexpected error: {e}")
+        return None
+
+
+# Insert a movie index row, built from list-page data
+def insert_movie_index(title, href, release_year=0):
+    try:
+        # Insert the movie record if it is not already present
+        cursor.execute("""
+            INSERT OR IGNORE INTO iafd_movies (title, href, release_year) VALUES (?, ?, ?)
+        """,
+            (title, href, release_year)
+        )
+        conn.commit()
+
+        movie_id = get_id_by_href('iafd_movies', href)
+        if movie_id:
+            logging.debug(f'insert one movie index, id: {movie_id}, title: {title}, href: {href}')
+
+        return movie_id
+    except Exception as e:
+        conn.rollback()
+        logging.error("Error inserting movie index: %s", e)
+        return None
+
+# Insert a performer-movie relation
+def insert_performer_movie(performer_id, movie_id, role, notes):
+    try:
+        cursor.execute("""
+            INSERT INTO iafd_performers_movies (performer_id, movie_id, role, notes)
+            VALUES (?, ?, ?, ?)
+            ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes, role=excluded.role
+        """,
+            (performer_id, movie_id, role, notes)
+        )
+        conn.commit()
+
+        #logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}')
+
+        return performer_id
+
+    except Exception as e:
+        conn.rollback()
+        logging.error("Error inserting performer_movie: %s", e)
+        return None
+
+# Insert a movie-to-movie (appears-in) relation
+def insert_movie_appears_in(movie_id, appears_in_id, gradation=0, notes=''):
+    try:
+        cursor.execute("""
+            INSERT INTO iafd_movies_appers_in (movie_id, appears_in_id, gradation, notes)
+            VALUES (?, ?, ?, ?)
+            ON CONFLICT(movie_id, appears_in_id) DO UPDATE SET notes=excluded.notes, gradation=excluded.gradation
+        """,
+            (movie_id, appears_in_id, gradation, notes)
+        )
+        conn.commit()
+
+        #logging.debug(f'insert one movie_appears_in, movie_id: {movie_id}, appears_in_id: {appears_in_id}')
+
+        return movie_id
+
+    except Exception as e:
+        conn.rollback()
+        logging.error("Error inserting movie_appears_in: %s", e)
+        return None
+
+
 # Insert performer details
 def insert_or_update_performer(data):
     try:
         cursor.execute("""
             INSERT INTO iafd_performers (href, name, gender, birthday, astrology, birthplace, years_active, ethnicity, nationality, hair_colors,
                                          eye_color, height_str, weight_str, measurements, tattoos, piercings, weight, height, movies_cnt, vixen_cnt,
-                                         blacked_cnt, tushy_cnt, x_art_cnt, updated_at)
-            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
+                                         blacked_cnt, tushy_cnt, x_art_cnt, is_full_data, updated_at)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
             ON CONFLICT(href) DO UPDATE SET
                 name = excluded.name,
                 gender = excluded.gender,
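Note: insert_performer_movie and insert_movie_appears_in rely on `ON CONFLICT(movie_id, performer_id)` / `ON CONFLICT(movie_id, appears_in_id)` upserts, which SQLite only accepts when that column pair carries a UNIQUE constraint (and which require SQLite 3.24 or newer). The relation tables themselves are not shown in this diff; a schema along these lines is assumed:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("""
        CREATE TABLE iafd_performers_movies (
            performer_id INTEGER,
            movie_id     INTEGER,
            role         TEXT,
            notes        TEXT,
            UNIQUE (movie_id, performer_id)   -- required by the ON CONFLICT target
        )
    """)
    conn.execute(
        "INSERT INTO iafd_performers_movies (performer_id, movie_id, role, notes) VALUES (?, ?, ?, ?) "
        "ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes, role=excluded.role",
        (1, 2, 'personal', ''),
    )
    conn.commit()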
@@ -45,6 +144,7 @@ def insert_or_update_performer(data):
                 blacked_cnt = excluded.blacked_cnt,
                 tushy_cnt = excluded.tushy_cnt,
                 x_art_cnt = excluded.x_art_cnt,
+                is_full_data = 1,
                 updated_at = datetime('now', 'localtime')
         """, (
             data["href"], data["person"], data.get("gender"), data.get("birthday"), data.get("astrology"), data.get("birthplace"), data.get("years_active"),
@@ -54,20 +154,36 @@ def insert_or_update_performer(data):
         ))
 
         # Get the performer_id
-        cursor.execute("SELECT id FROM iafd_performers WHERE href = ?", (data["href"],))
-        performer_id = cursor.fetchone()[0]
-
-        # Delete the old aliases
-        cursor.execute("DELETE FROM iafd_performer_aliases WHERE performer_id = ?", (performer_id,))
+        performer_id = get_id_by_href('iafd_performers', data["href"])
+        if performer_id is None:
+            return None
+        logging.debug(f"insert one performer, id: {performer_id}, name: {data['person']}, href: {data['href']}")
 
         # Insert the new aliases
-        #for alias in data.get("performer_aka", []):
         for alias in data.get("performer_aka") or []:
             if alias.lower() != "no known aliases":
-                cursor.execute("INSERT INTO iafd_performer_aliases (performer_id, alias) VALUES (?, ?) ON CONFLICT(performer_id, alias) DO NOTHING ", (performer_id, alias))
+                cursor.execute("INSERT OR IGNORE INTO iafd_performer_aliases (performer_id, alias) VALUES (?, ?) ", (performer_id, alias))
 
         conn.commit()
-        logging.debug(f"insert/update performer succeeded: {data['person']}")
 
+        # Insert the credits; one person may carry both the 'personal' and 'director' roles
+        credits = data['credits']
+        if credits is None:
+            return performer_id
+        for role, movies in credits.items():
+            if movies:
+                for movie in movies:
+                    movie_id = get_id_by_href('iafd_movies', movie['href'])
+                    # Movie not present yet; insert it first
+                    if movie_id is None:
+                        movie_id = insert_movie_index(movie['title'], movie['href'], utils.to_number(movie['year']))
+                    if movie_id:
+                        tmp_id = insert_performer_movie(performer_id, movie_id, role, movie['notes'])
+                        if tmp_id:
+                            logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}, role: {role}')
+                        else:
+                            logging.warning(f"insert performer_movie failed. performer_id: {performer_id}, movie href: {movie['href']}")
 
         return performer_id
 
     except sqlite3.Error as e:
@@ -124,7 +240,7 @@ def query_performer(identifier):
 # Query the href list by filter conditions
 def query_performer_hrefs(**filters):
     try:
-        sql = "SELECT href FROM iafd_performers WHERE 1=1"
+        sql = "SELECT href, name FROM iafd_performers WHERE 1=1"
         params = []
 
         if "id" in filters:
@@ -136,9 +252,17 @@ def query_performer_hrefs(**filters):
         if "name" in filters:
             sql += " AND name LIKE ?"
             params.append(f"%{filters['name']}%")
+        if "is_full_data" in filters:
+            sql += " AND is_full_data = ?"
+            params.append(filters["is_full_data"])
+        if 'limit' in filters:
+            sql += " LIMIT ?"
+            params.append(filters["limit"])
 
         cursor.execute(sql, params)
-        return [row[0].lower() for row in cursor.fetchall()]  # return lowercased
+        #return [row[0].lower() for row in cursor.fetchall()]  # return lowercased
+        return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
 
     except sqlite3.Error as e:
         logging.error(f"href query failed: {e}")
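Note: the return shape changes from a bare list of lowercased hrefs to a list of `{'href', 'name'}` dicts, and the `.lower()` normalization is dropped; any caller still doing membership tests against the old bare-string list would silently break, which is consistent with the old check_update code that did exactly that being deleted in this commit. (`LIMIT ?` with a bound parameter is valid SQLite.) New call shape, with a stub standing in for the query:

    def query_performer_hrefs_stub(**filters):
        # Stand-in returning the new row shape.
        return [{'href': '/person/alice.htm', 'name': 'Alice'}]

    for row in query_performer_hrefs_stub(is_full_data=0, limit=1000):
        url, person = row['href'], row['name']   # dict access replaces the old bare strings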
@@ -303,12 +427,6 @@ def query_studio_hrefs(**filters):
         logging.error(f"href query failed: {e}")
         return None
 
-# Look up an id by href in the given table
-def get_id_by_href(table: str, href: str) -> int:
-    cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
-    row = cursor.fetchone()
-    return row[0] if row else None
-
 # Insert or update movie data
 def insert_or_update_movie(movie_data):
     try:
@@ -316,62 +434,67 @@ def insert_or_update_movie(movie_data):
         distributor_id = get_id_by_href('iafd_distributors', movie_data['DistributorHref'])
         studio_id = get_id_by_href('iafd_studios', movie_data['StudioHref'])
         director_id = get_id_by_href('iafd_performers', movie_data['DirectorHref'])
+        # If the director is missing, insert an index row first
+        if director_id is None:
+            director_id = insert_performer_index(movie_data['Director'], movie_data['DirectorHref'])
 
         # Insert or update the movie record
         cursor.execute(
             """
             INSERT INTO iafd_movies (title, minutes, distributor_id, studio_id, release_date, added_to_IAFD_date,
-                                     all_girl, all_male, compilation, webscene, director_id, href, updated_at)
-            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
+                                     all_girl, all_male, compilation, webscene, director_id, href, is_full_data, updated_at)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
             ON CONFLICT(href) DO UPDATE SET
                 title=excluded.title, minutes=excluded.minutes, distributor_id=excluded.distributor_id,
                 studio_id=excluded.studio_id, release_date=excluded.release_date,
                 added_to_IAFD_date=excluded.added_to_IAFD_date, all_girl=excluded.all_girl,
                 all_male=excluded.all_male, compilation=excluded.compilation, webscene=excluded.webscene,
-                director_id=excluded.director_id, updated_at = datetime('now', 'localtime')
+                director_id=excluded.director_id, is_full_data=1, updated_at = datetime('now', 'localtime')
             """,
             (movie_data['title'], movie_data['Minutes'], distributor_id, studio_id, movie_data['ReleaseDate'],
              movie_data['AddedtoIAFDDate'], movie_data['All-Girl'], movie_data['All-Male'],
             movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
        )
        conn.commit()
-        logging.debug("Movie inserted/updated: %s", movie_data['title'])
 
         # Get the inserted movie_id
-        cursor.execute("SELECT id FROM iafd_movies WHERE href = ?", (movie_data['href'],))
-        movie_id = cursor.fetchone()[0]
+        movie_id = get_id_by_href('iafd_movies', movie_data['href'])
+        if movie_id is None:
+            return None
 
+        logging.debug(f"insert one movie, id: {movie_id}, title: {movie_data['title']}, href: {movie_data['href']}")
 
         # Populate the performers_movies relation table
         for performer in movie_data.get('Performers', []):
             performer_id = get_id_by_href('iafd_performers', performer['href'])
+            # If the performer is missing, insert an index row first
+            if performer_id is None:
+                performer_id = insert_performer_index(performer['name'], performer['href'])
             if performer_id:
-                notes = '|'.join(performer['tags'])
-                cursor.execute(
-                    """
-                    INSERT INTO iafd_performers_movies (performer_id, movie_id, role, notes)
-                    VALUES (?, ?, ?, ?)
-                    ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes
-                    """,
-                    (performer_id, movie_id, "Actor", notes)
-                )
-                logging.debug(f"Performers {performer['href']} linked to movie: %s", movie_data['title'])
+                notes = '|'.join(tag for tag in performer['tags'] if tag != performer['name'])
+                tmp_id = insert_performer_movie(performer_id, movie_id, 'personal', notes)
+                if tmp_id:
+                    logging.debug(f"insert one performer_movie. performer_id: {performer_id}, movie_id:{movie_id}")
+                else:
+                    logging.debug(f'insert performer_movie failed. performer_id: {performer_id}, movie_id:{movie_id}')
             else:
-                logging.warning(f"missing performer, url {performer['href']}, in movie: ({movie_data['title']}) {movie_data['href']}")
+                logging.warning(f"insert performer failed. name: {performer['name']}, href: {performer['href']}")
 
         # Populate the movies_appers_in table
         for appears in movie_data.get("AppearsIn", []):
             appears_in_id = get_id_by_href('iafd_movies', appears['href'])
+            # Not present yet; insert it first
+            if appears_in_id is None:
+                appears_in_id = insert_movie_index(appears['title'], appears['href'])
             if appears_in_id:
-                appears_in_id = appears_in_id[0]
-                cursor.execute("""
-                    INSERT INTO iafd_movies_appers_in (movie_id, appears_in_id, gradation, notes)
-                    VALUES (?, ?, ?, ?)
-                    ON CONFLICT(movie_id, appears_in_id) DO NOTHING
-                """, (movie_id, appears_in_id, 1, appears["title"]))
+                tmp_id = insert_movie_appears_in(movie_id, appears_in_id)
+                if tmp_id:
+                    logging.debug(f'insert one movie_appears_in record. movie_id: {movie_id}, appears_in_id: {appears_in_id}')
+                else:
+                    logging.warning(f'insert movie_appears_in failed. movie_id: {movie_id}, appears_in_id: {appears_in_id}')
             else:
-                logging.warning(f"missing AppearsIn movie in movies table, parent_url {appears['href']}, in movie: ({movie_data['title']}) {movie_data['href']}")
+                logging.warning(f"get appears_in_id failed. title: {appears['title']}, href: {appears['href']}")
 
-        conn.commit()
         return movie_id
 
     except Exception as e:
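Note: the recurring pattern in the rewritten insert_or_update_movie is "resolve a foreign key by href, creating a stub index row when the target does not exist yet", applied in turn to the director, each performer, and each appears-in movie. Condensed into one helper, as a sketch that uses get_id_by_href and insert_movie_index from earlier in this diff:

    def ensure_movie_id(title, href, year=0):
        # Lookup-or-create step shared by the director/performer/appears-in paths.
        movie_id = get_id_by_href('iafd_movies', href)
        if movie_id is None:
            movie_id = insert_movie_index(title, href, year)
        return movie_id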
@@ -424,7 +547,7 @@ def query_movies(identifier):
 # Query the href list by filter conditions
 def query_movie_hrefs(**filters):
     try:
-        sql = "SELECT href FROM iafd_movies WHERE 1=1"
+        sql = "SELECT href, title FROM iafd_movies WHERE 1=1"
         params = []
 
         if "id" in filters:
@@ -436,9 +559,16 @@ def query_movie_hrefs(**filters):
         if "title" in filters:
             sql += " AND title LIKE ?"
             params.append(f"%{filters['title']}%")
+        if "is_full_data" in filters:
+            sql += " AND is_full_data = ?"
+            params.append(filters["is_full_data"])
+        if 'limit' in filters:
+            sql += " LIMIT ?"
+            params.append(filters["limit"])
 
         cursor.execute(sql, params)
-        return [row[0].lower() for row in cursor.fetchall()]  # hrefs are lowercased
+        #return [row[0].lower() for row in cursor.fetchall()]  # hrefs are lowercased
+        return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
 
     except sqlite3.Error as e:
         logging.error(f"href query failed: {e}")
@@ -457,7 +587,7 @@ def insert_task_log():
         return None
 
 # Update fields on a task log entry
-def update_task_log(task_id, **kwargs):
+def update_task_log_inner(task_id, **kwargs):
     try:
         fields = ", ".join(f"{key} = ?" for key in kwargs.keys())
         params = list(kwargs.values()) + [task_id]
@@ -468,30 +598,45 @@ def update_task_log(task_id, **kwargs):
     except sqlite3.Error as e:
         logging.error(f"failed to update task {task_id}: {e}")
 
+# Update a task log entry's status, refreshing the table-count snapshots
+def update_task_log(task_id, task_status):
+    try:
+        # Get the current row counts of the performers, studios, etc. tables
+        cursor.execute("SELECT COUNT(*) FROM iafd_performers where is_full_data=1")
+        full_data_performers = cursor.fetchone()[0]
+        cursor.execute("SELECT COUNT(*) FROM iafd_performers")
+        total_performers = cursor.fetchone()[0]
+
+        cursor.execute("SELECT COUNT(*) FROM iafd_movies where is_full_data=1")
+        full_data_movies = cursor.fetchone()[0]
+        cursor.execute("SELECT COUNT(*) FROM iafd_movies")
+        total_movies = cursor.fetchone()[0]
+
+        cursor.execute("SELECT COUNT(*) FROM iafd_distributors")
+        total_distributors = cursor.fetchone()[0]
+
+        cursor.execute("SELECT COUNT(*) FROM iafd_studios")
+        total_studios = cursor.fetchone()[0]
+
+        # Update the task_log row
+        update_task_log_inner(task_id,
+                              full_data_performers=full_data_performers,
+                              total_performers=total_performers,
+                              full_data_movies=full_data_movies,
+                              total_movies=total_movies,
+                              total_distributors=total_distributors,
+                              total_studios=total_studios,
+                              task_status=task_status)
+
+    except sqlite3.Error as e:
+        logging.error(f"failed to update task {task_id}: {e}")
+
 
 # Task finished; update the final fields
 def finalize_task_log(task_id):
     try:
-        # Get the final row counts of the performers, studios, etc. tables
-        cursor.execute("SELECT COUNT(*) FROM iafd_performers")
-        after_performers = cursor.fetchone()[0]
-
-        cursor.execute("SELECT COUNT(*) FROM iafd_movies")
-        after_movies = cursor.fetchone()[0]
-
-        cursor.execute("SELECT COUNT(*) FROM iafd_distributors")
-        after_distributors = cursor.fetchone()[0]
-
-        cursor.execute("SELECT COUNT(*) FROM iafd_studios")
-        after_studios = cursor.fetchone()[0]
-
         # Update the task_log row
-        update_task_log(task_id,
-                        after_performers=after_performers,
-                        after_movies=after_movies,
-                        after_distributors=after_distributors,
-                        after_studios=after_studios,
-                        task_status="Success")
+        update_task_log(task_id, task_status="Success")
 
     except sqlite3.Error as e:
         logging.error(f"failed to finalize task {task_id}: {e}")
 
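Note: update_task_log is now split in two: update_task_log_inner still writes arbitrary columns from kwargs, while the new update_task_log recomputes COUNT(*) snapshots on every status change, so progress is visible per stage rather than only at finalize time. The counting step amounts to this small helper (f-string SQL is acceptable here only because the table names come from code, never from user input):

    def table_count(cursor, table, where=""):
        # Same COUNT(*) snapshots the new wrapper takes, e.g.
        # table_count(cursor, "iafd_movies", "WHERE is_full_data=1")
        cursor.execute(f"SELECT COUNT(*) FROM {table} {where}")
        return cursor.fetchone()[0]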
@@ -24,6 +24,13 @@ update_dir = '../result'
 performers_dir = f'{update_dir}/performers'
 movies_dir = f'{update_dir}/movies'
 
+def to_number(value):
+    """Convert a string to a number; return 0 if invalid."""
+    try:
+        return float(value)
+    except (ValueError, TypeError):
+        return 0
+
 def dist_stu_href_rewrite(href):
     # Extract the ID (works for either distrib or studio)
     import re
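Note: to_number parses with float(), so whole-number strings come back as floats; callers such as insert_movie_index(..., release_year=utils.to_number(movie['year'])) will therefore store 2003.0 rather than 2003 unless the column affinity or the caller coerces to int. Behavior as written:

    def to_number(value):
        """Convert a string to a number; return 0 if invalid."""
        try:
            return float(value)
        except (ValueError, TypeError):
            return 0

    print(to_number('2003'))   # 2003.0
    print(to_number('n/a'))    # 0
    print(to_number(None))     # 0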
scripts/javdb/src/config.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+import logging
+import os
+import inspect
+from datetime import datetime
+
+global_share_data_dir = '/root/sharedata'
+global_host_data_dir = '/root/hostdir/scripts_data'
+
+# Configure logging
+def setup_logging(log_filename=None):
+    # If no log_filename is passed in, use the calling script's name as the log file name
+    if log_filename is None:
+        # Get the file name of the script that called setup_logging
+        caller_frame = inspect.stack()[1]
+        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
+
+        # Current date in yyyymmdd format
+        current_date = datetime.now().strftime('%Y%m%d')
+        # Build the log file name, putting the date before the extension
+        log_filename = f'../log/{caller_filename}_{current_date}.log'
+
+    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
+                        handlers=[
+                            logging.FileHandler(log_filename),
+                            logging.StreamHandler()
+                        ])
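Note: setup_logging inspects the call stack to name the log file after whichever script called it, so the plain config.setup_logging() in fetch.py below logs to ../log/fetch_<yyyymmdd>.log plus the console. One caveat worth knowing: logging.basicConfig is a no-op when the root logger already has handlers, so only the first caller in a process wins. Usage:

    import config

    config.setup_logging()                       # ../log/<caller>_<yyyymmdd>.log + stream
    config.setup_logging('../log/manual.log')    # or pin the file name explicitly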
scripts/javdb/src/fetch.py (new file, 271 lines)
@@ -0,0 +1,271 @@
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import csv
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from functools import partial
|
||||||
|
import config
|
||||||
|
import sqlite_utils as db_tools
|
||||||
|
import scraper
|
||||||
|
import utils
|
||||||
|
|
||||||
|
config.setup_logging()
|
||||||
|
|
||||||
|
debug = False
|
||||||
|
force = False
|
||||||
|
|
||||||
|
# 获取演员列表
|
||||||
|
def fetch_actor_list():
|
||||||
|
next_url = scraper.actors_uncensored_base_url
|
||||||
|
while next_url:
|
||||||
|
logging.info(f'fetching page {next_url}')
|
||||||
|
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
|
||||||
|
if soup:
|
||||||
|
list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
|
||||||
|
if list_data :
|
||||||
|
# 写入数据库
|
||||||
|
for row in list_data:
|
||||||
|
actor_id = db_tools.insert_actor_index(name=row['name'], href=row['href'] if row['href'] else '')
|
||||||
|
if actor_id:
|
||||||
|
logging.debug(f'insert performer index to db. performer_id:{actor_id}, name: {row['name']}, href:{row['href']}')
|
||||||
|
else:
|
||||||
|
logging.warning(f'insert performer index failed. name: {row['name']}, href:{row['href']}')
|
||||||
|
else:
|
||||||
|
logging.warning(f'fetch actor error. {next_url} ...')
|
||||||
|
|
||||||
|
# 获取makers列表
|
||||||
|
def fetch_makers_list():
|
||||||
|
next_url = scraper.makers_uncensored_base_url
|
||||||
|
while next_url:
|
||||||
|
logging.info(f'fetching page {next_url}')
|
||||||
|
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
|
||||||
|
if soup:
|
||||||
|
list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
|
||||||
|
if list_data :
|
||||||
|
# 写入数据库
|
||||||
|
for row in list_data:
|
||||||
|
maker_id = db_tools.insert_or_update_makers(row)
|
||||||
|
if maker_id:
|
||||||
|
logging.debug(f'insert maker to db. maker_id:{maker_id}, name: {row['name']}, href:{row['href']}')
|
||||||
|
else:
|
||||||
|
logging.warning(f'insert maker failed. name: {row['name']}, href:{row['href']}')
|
||||||
|
else:
|
||||||
|
logging.warning(f'fetch actor error. {next_url} ...')
|
||||||
|
|
||||||
|
# 获取series列表
|
||||||
|
def fetch_series_list():
|
||||||
|
next_url = scraper.series_uncensored_base_url
|
||||||
|
while next_url:
|
||||||
|
logging.info(f'fetching page {next_url}')
|
||||||
|
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
|
||||||
|
if soup:
|
||||||
|
list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
|
||||||
|
if list_data :
|
||||||
|
# 写入数据库
|
||||||
|
for row in list_data:
|
||||||
|
maker_id = db_tools.insert_or_update_series(row)
|
||||||
|
if maker_id:
|
||||||
|
logging.debug(f'insert series to db. maker_id:{maker_id}, name: {row['name']}, href:{row['href']}')
|
||||||
|
else:
|
||||||
|
logging.warning(f'insert series failed. name: {row['name']}, href:{row['href']}')
|
||||||
|
else:
|
||||||
|
logging.warning(f'fetch actor error. {next_url} ...')
|
||||||
|
|
||||||
|
|
||||||
|
# 更新makers列表中的影片信息
|
||||||
|
def fetch_movies_by_maker():
|
||||||
|
url_list = db_tools.query_maker_hrefs()
|
||||||
|
if debug:
|
||||||
|
url_list = db_tools.query_maker_hrefs(name='muramura')
|
||||||
|
for url in url_list:
|
||||||
|
next_url = url
|
||||||
|
while True:
|
||||||
|
logging.info(f"Fetching data for maker url {next_url} ...")
|
||||||
|
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
|
||||||
|
if soup:
|
||||||
|
list_data, next_url = scraper.parse_maker_detail(soup, next_url)
|
||||||
|
if list_data:
|
||||||
|
for movie in list_data:
|
||||||
|
tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'])
|
||||||
|
if tmp_id:
|
||||||
|
logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}')
|
||||||
|
else:
|
||||||
|
logging.warning(f'insert movie index failed. title: {movie['title']}, href: {movie['href']}')
|
||||||
|
else :
|
||||||
|
logging.warning(f'parse_page_movie error. url: {next_url}')
|
||||||
|
# 调试增加brak
|
||||||
|
if debug:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# 更新series列表中的影片信息
|
||||||
|
def fetch_movies_by_series():
|
||||||
|
url_list = db_tools.query_series_hrefs()
|
||||||
|
if debug:
|
||||||
|
url_list = db_tools.query_series_hrefs(name='10musume')
|
||||||
|
for url in url_list:
|
||||||
|
next_url = url
|
||||||
|
while True:
|
||||||
|
logging.info(f"Fetching data for series url {next_url} ...")
|
||||||
|
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
|
||||||
|
if soup:
|
||||||
|
list_data, next_url = scraper.parse_series_detail(soup, next_url)
|
||||||
|
if list_data:
|
||||||
|
for movie in list_data:
|
||||||
|
tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'])
|
||||||
|
if tmp_id:
|
||||||
|
logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}')
|
||||||
|
else:
|
||||||
|
logging.warning(f'insert movie index failed. title: {movie['title']}, href: {movie['href']}')
|
||||||
|
else :
|
||||||
|
logging.warning(f'parse_page_movie error. url: {next_url}')
|
||||||
|
# 调试增加brak
|
||||||
|
if debug:
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
# Update performer info
def fetch_performers_detail():
    while True:
        # Fetch a small batch from the database each round instead of loading everything at once
        performers_list = db_tools.query_actors(is_full_data=0, limit=10)
        if len(performers_list) < 1:
            logging.info('all performers fetched.')
            break
        for performer in performers_list:
            url = performer['href']
            person = performer['name']

            next_url = url
            all_movies = []
            while next_url:
                logging.info(f"Fetching data for actor ({person}), url {next_url} ...")
                soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
                if soup:
                    data, next_url = scraper.parse_actor_detail(soup, next_url)
                    if data:
                        all_movies.extend(data)
                else:
                    logging.warning(f'fetch_page error. person: ({person}), url: {url}')
                    break

            # All movies for this person fetched; insert the data
            performer_id = db_tools.insert_or_update_actor({
                'href': url,
                'name': person,
                'pic': '',
                'alias': [],
                'credits': all_movies
            })
            if performer_id:
                logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
            else:
                logging.warning(f'insert person: ({person}) {url} failed.')

            # break early in debug mode
            if debug:
                return True


# Update movie info
def fetch_movies_detail():
    while True:
        # Fetch a small batch from the database each round
        movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=10)
        if len(movies_list) < 1:
            logging.info('all movies fetched.')
            break
        for movie in movies_list:
            url = movie['href']
            title = movie['title']
            logging.info(f"Fetching data for movie ({title}), url {url} ...")
            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
            if soup:
                movie_data = scraper.parse_movie_detail(soup, url, title)
                if movie_data:
                    movie_id = db_tools.insert_or_update_movie(movie_data)
                    if movie_id:
                        logging.info(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
                    else:
                        logging.warning(f'insert movie {url} failed.')
                else:
                    logging.warning(f'parse_page_movie error. url: {url}')
            else:
                logging.warning(f'fetch_page error. url: {url}')

        # break early in debug mode
        if debug:
            return True


# Run an update pass
def check_update():

    # Start the task
    task_id = db_tools.insert_task_log()
    if task_id is None:
        logging.warning('insert task log error.')
        return None

    if False:  # list refresh temporarily disabled
        # Refresh the actor list
        db_tools.update_task_log(task_id, task_status='fetching actor list')
        fetch_actor_list()

        # Refresh the makers list
        db_tools.update_task_log(task_id, task_status='fetching maker list')
        fetch_makers_list()

        # Refresh the series list
        db_tools.update_task_log(task_id, task_status='fetching series list')
        fetch_series_list()

        # Refresh the movie lists
        db_tools.update_task_log(task_id, task_status='fetching movie list by maker')
        fetch_movies_by_maker()
        db_tools.update_task_log(task_id, task_status='fetching movie list by series')
        fetch_movies_by_series()

    # Update performer info
    db_tools.update_task_log(task_id, task_status='fetching performers')
    fetch_performers_detail()

    # Update movie info
    db_tools.update_task_log(task_id, task_status='fetching movies')
    fetch_movies_detail()

    logging.info('all process completed!')
    db_tools.finalize_task_log(task_id)


# TODO:
# 1,


# Process local data
def load_data():
    return True


# Main function
def main(task, args_debug, args_force):
    global debug
    debug = args_debug
    if debug:
        logging.info('Debug mode enabled.')

    global force
    force = args_force
    if force:
        logging.info('force update for all data.')

    if task == 'fetch':
        check_update()
    elif task == 'load':
        load_data()
    else:
        print('unknown command. see --help.')


if __name__ == "__main__":
    # Command-line argument handling
    parser = argparse.ArgumentParser(description='fetch javdb data.')
    parser.add_argument('--task', type=str, default='fetch', help='fetch from javdb.com or load from local data ... (fetch, load)')
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
    args = parser.parse_args()

    main(args.task, args.debug, args.force)
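# Hypothetical invocation sketch (the script filename is assumed, not shown in this diff):
#   python fetch.py --task fetch --debug   # limited run, early breaks enabled
#   python fetch.py --task load            # process local data only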
454 scripts/javdb/src/scraper.py Normal file
@ -0,0 +1,454 @@
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config

# Base URLs and variable parameters
host_url = "https://www.javdb.com"
actors_uncensored_base_url = f'{host_url}/actors/uncensored'
series_uncensored_base_url = f'{host_url}/series/uncensored'
makers_uncensored_base_url = f'{host_url}/makers/uncensored'

# Set up headers and the scraper
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()

# Fetch a page with CloudScraper and validate it; supports different parsers and preprocessing
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            if 'javdb.com' not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None

            response = scraper.get(url, headers=headers)
            response.raise_for_status()  # raise on HTTP errors

            # Preprocess the HTML (if a preprocessor is provided)
            html_text = preprocessor(response.text) if preprocessor else response.text

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the custom page check
                return soup

            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None  # still failing after reaching max retries

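# Minimal usage sketch (the helper name _example_fetch_actors_page is hypothetical
# and is never called; generic_validator is defined below and resolved at call time):
def _example_fetch_actors_page():
    soup = fetch_page(actors_uncensored_base_url,
                      partial(generic_validator, tag="div", identifier="actors", attr_type="id"))
    if soup:
        print('page validated, title:', soup.title.text if soup.title else '(none)')
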
# Repair the HTML structure: strip stray tags and patch <a> tags; needed when extracting ethnicity
def preprocess_html(html):
    return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')


# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False

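# e.g. generic_validator(soup, tag="div", identifier="actors", attr_type="id")
# is True only when the parsed page contains <div id="actors">.
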
# Parse the page number out of a link
def url_page_num(href):
    if href is None:
        return None
    match = re.search(r'page=(\d+)', href)
    if match:
        next_page_number = int(match.group(1))
        return next_page_number
    else:
        return None

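# Illustrative results (hypothetical URLs):
#   url_page_num('/actors/uncensored?page=3')  -> 3
#   url_page_num('/actors/uncensored')         -> None
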
# Parse the HTML and extract the data we need
def parse_actors_uncensored(soup, href):
    div_actors = soup.find("div", id='actors')
    if not div_actors:
        logging.warning("Warning: No actors div found")
        return None, None

    # Parse the elements
    rows = div_actors.find_all('div', class_='box actor-box')

    list_data = []
    next_url = None
    for row in rows:
        # Actor detail link
        actor_link = row.find('a')['href']
        # Actor name
        actor_name = row.find('strong').text.strip()
        # Avatar image URL
        avatar_url = row.find('img', class_='avatar')['src']
        # Aliases from the title attribute
        alias_list = row.find('a')['title'].split(", ")

        list_data.append({
            'name' : actor_name,
            'href' : host_url + actor_link if actor_link else '',
            'pic'  : avatar_url,
            'alias': alias_list
        })

    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url


# Parse the HTML and extract the data we need
def parse_actor_detail(soup, href):
    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
    if not div_movies:
        logging.warning("Warning: No movies div found")
        return None, None

    # Parse the elements
    rows = div_movies.find_all('div', class_='item')

    list_data = []
    next_url = None
    for row in rows:
        link = row.find('a', class_='box')['href']
        serial_number = row.find('strong').text.strip()
        title = row.find('div', class_='video-title').text.strip()
        release_date = row.find('div', class_='meta').text.strip()
        list_data.append({
            'href' : host_url + link if link else '',
            'serial_number' : serial_number,
            'title' : title,
            'release_date': release_date
        })

    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        logging.debug(f'current_page: {current_page_number}, next page_num: {next_page_number}')
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url


# Parse the HTML and extract the data we need
def parse_movie_detail(soup, href, title):
    div_video = soup.find("div", class_='video-meta-panel')
    if not div_video:
        logging.warning("Warning: No video meta panel found")
        # callers treat the result as a single value, so do not return a tuple here
        return None

    # Cover image
    cover_img = soup.select_one('.column-video-cover a')
    cover_url = cover_img['href'] if cover_img else None

    # Serial number
    serial = soup.select_one('.panel-block:first-child .value')
    serial_number = serial.text.strip() if serial else None

    # Release date
    date = soup.select_one('.panel-block:nth-of-type(2) .value')
    release_date = date.text.strip() if date else None

    # Duration
    duration = soup.select_one('.panel-block:nth-of-type(3) .value')
    video_duration = duration.text.strip() if duration else None

    # Maker
    maker = soup.select_one('.panel-block:nth-of-type(4) .value a')
    maker_name = maker.text.strip() if maker else None
    maker_link = maker['href'] if maker else None

    # Series
    series = soup.select_one('.panel-block:nth-of-type(5) .value a')
    series_name = series.text.strip() if series else None
    series_link = series['href'] if series else None

    # Actors (name + link)
    actors = [{'name': actor.text.strip(), 'href': host_url + actor['href']} for actor in soup.select('.panel-block:nth-of-type(8) .value a')]

    return {
        'href' : href,
        'title' : title,
        'cover_url': cover_url,
        'serial_number': serial_number,
        'release_date': release_date,
        'duration': video_duration,
        'maker_name': maker_name,
        'maker_link': host_url + maker_link if maker_link else '',
        'series_name': series_name,
        'series_link': host_url + series_link if series_link else '',
        'actors': actors
    }

# Parse the HTML and extract the data we need
def parse_series_uncensored(soup, href):
    div_series = soup.find("div", id='series')
    if not div_series:
        logging.warning("Warning: No series div found")
        return None, None

    # Parse the elements
    rows = div_series.find_all('a', class_='box')

    list_data = []
    next_url = None
    for row in rows:
        name = row.find('strong').text.strip()
        link = row['href']  # do not shadow the href parameter; it is needed for paging below
        div_movies = row.find('span')
        movies = 0
        if div_movies:
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))

        list_data.append({
            'name' : name,
            'href' : host_url + link if link else '',
            'movies' : movies
        })

    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url


# Parse the HTML and extract the data we need
def parse_series_detail(soup, href):
    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
    if not div_movies:
        logging.warning("Warning: No movies div found")
        return None, None

    # Parse the elements
    rows = div_movies.find_all('div', class_='item')

    list_data = []
    next_url = None
    for row in rows:
        link = row.find('a', class_='box')['href']
        serial_number = row.find('strong').text.strip()
        title = row.find('div', class_='video-title').text.strip()
        release_date = row.find('div', class_='meta').text.strip()
        list_data.append({
            'href' : host_url + link if link else '',
            'serial_number' : serial_number,
            'title' : title,
            'release_date': release_date
        })

    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url


# Parse the HTML and extract the data we need
def parse_makers_uncensored(soup, href):
    div_makers = soup.find("div", id='makers')
    if not div_makers:
        logging.warning("Warning: No makers div found")
        return None, None

    # Parse the elements
    rows = div_makers.find_all('a', class_='box')

    list_data = []
    next_url = None
    for row in rows:
        name = row.find('strong').text.strip()
        link = row['href']  # do not shadow the href parameter; it is needed for paging below
        div_movies = row.find('span')
        movies = 0
        if div_movies:
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))

        list_data.append({
            'name' : name,
            'href' : host_url + link if link else '',
            'movies' : movies
        })

    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url


# Parse the HTML and extract the data we need
def parse_maker_detail(soup, href):
    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
    if not div_movies:
        logging.warning("Warning: No movies div found")
        return None, None

    # Parse the elements
    rows = div_movies.find_all('div', class_='item')

    list_data = []
    next_url = None
    for row in rows:
        link = row.find('a', class_='box')['href']
        serial_number = row.find('strong').text.strip()
        title = row.find('div', class_='video-title').text.strip()
        release_date = row.find('div', class_='meta').text.strip()
        list_data.append({
            'href' : host_url + link if link else '',
            'serial_number' : serial_number,
            'title' : title,
            'release_date': release_date
        })

    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url


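# The "next page" block above is duplicated in every parse_* function; a shared
# helper could look like this (a sketch only, not wired into the code above):
def _next_page_url(soup, current_href):
    # Return the absolute next-page URL, or None when already on the last page
    element = soup.find('a', class_='pagination-next')
    if not element:
        return None
    next_num = url_page_num(element['href'])
    current_num = url_page_num(current_href) or 0
    return host_url + element['href'] if next_num and next_num > current_num else None

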
###### Test code below ######
def test_actors_list():
    next_url = actors_uncensored_base_url
    while next_url:
        print(f'fetching page {next_url}')
        soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="actors", attr_type="id"))
        if soup:
            list_data, next_url = parse_actors_uncensored(soup, next_url)
            if list_data:
                print(list_data)
        else:
            print('get wrong page.')
            if next_url:
                print(next_url)
            break


def test_actor():
    next_url = 'https://javdb.com/actors/mdRn'
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
        if soup:
            list_data, next_url = parse_actor_detail(soup, next_url)
            if list_data:
                all_data.extend(list_data)
        else:
            print('get wrong page.')
            break
    print(all_data)


def test_movie_detail():
    movie_url = 'https://javdb.com/v/gB2Q7'
    soup = fetch_page(movie_url, partial(generic_validator, tag="div", identifier="video-detail", attr_type="class"))
    if soup:
        detail = parse_movie_detail(soup, movie_url, 'RED193 無碼 レッドホットフェティッシュコレクション 中出し120連発 4 : 波多野結衣, 愛乃なみ, 夢実あくび, 他多数')
        if detail:
            print(detail)


def test_series_list():
    next_url = 'https://javdb.com/series/uncensored'
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="series", attr_type="id"))
        if soup:
            list_data, next_url = parse_series_uncensored(soup, next_url)
            if list_data:
                all_data.extend(list_data)
        else:
            print('get wrong page.')
            break

    print(all_data)


def test_series_detail():
    next_url = 'https://javdb.com/series/39za'
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
        if soup:
            list_data, next_url = parse_series_detail(soup, next_url)
            if list_data:
                all_data.extend(list_data)
        else:
            print('get wrong page.')
            break
    print(all_data)


if __name__ == "__main__":
    #test_actors_list()
    #test_actor()
    test_movie_detail()
    #test_series_list()
    #test_series_detail()

551 scripts/javdb/src/sqlite_utils.py Normal file
@ -0,0 +1,551 @@
import sqlite3
import json
import config
import logging
from datetime import datetime

# Connect to the SQLite database
DB_PATH = f"{config.global_share_data_dir}/shared.db"  # replace with your database file
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
cursor = conn.cursor()


# Look up an id in the given table by href
def get_id_by_href(table: str, href: str) -> int:
    if href is None:
        return None
    cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
    row = cursor.fetchone()
    return row[0] if row else None

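# e.g. get_id_by_href('javdb_actors', 'https://www.javdb.com/actors/21Jp')
# returns the integer primary key, or None when no such row exists.
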
# Insert an actor index entry, taken from list data
def insert_actor_index(name, href):
    try:
        cursor.execute("""
            INSERT OR IGNORE INTO javdb_actors (href, name) VALUES (?, ?)
        """, (
            href, name
        ))
        conn.commit()

        performer_id = get_id_by_href('javdb_actors', href)
        if performer_id:
            logging.debug(f'insert one actor index, id: {performer_id}, name: {name}, href: {href}')

        return performer_id
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Database error: {e}")
        return None
    except Exception as e:
        conn.rollback()
        logging.error(f"Unexpected error: {e}")
        return None


# """插入电影索引,来自于列表数据"""
|
||||||
|
def insert_movie_index(title, href):
|
||||||
|
try:
|
||||||
|
# 插入或更新电影信息
|
||||||
|
cursor.execute("""
|
||||||
|
INSERT OR IGNORE INTO javdb_movies (title, href) VALUES (?, ?)
|
||||||
|
""",
|
||||||
|
(title, href)
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
movie_id = get_id_by_href('javdb_movies', href)
|
||||||
|
if movie_id:
|
||||||
|
logging.debug(f'insert one movie index, id: {movie_id}, title: {title}, href: {href}')
|
||||||
|
|
||||||
|
return movie_id
|
||||||
|
except Exception as e:
|
||||||
|
conn.rollback()
|
||||||
|
logging.error("Error inserting movie: %s", e)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Insert an actor-movie relation
def insert_actor_movie(performer_id, movie_id, tags=''):
    try:
        cursor.execute("""
            INSERT INTO javdb_actors_movies (actor_id, movie_id, tags)
            VALUES (?, ?, ?)
            ON CONFLICT(actor_id, movie_id) DO UPDATE SET tags=excluded.tags
        """,
            (performer_id, movie_id, tags)
        )
        conn.commit()

        #logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}')

        return performer_id

    except Exception as e:
        conn.rollback()
        logging.error("Error inserting actor-movie relation: %s", e)
        return None

# Insert or update an actor record
def insert_or_update_actor(actor):
    try:
        cursor.execute('''
            INSERT INTO javdb_actors (name, href, pic, is_full_data, updated_at)
            VALUES (?, ?, ?, 1, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET name=excluded.name, pic=excluded.pic, is_full_data=1, updated_at=datetime('now', 'localtime')
        ''', (actor['name'], actor['href'], actor['pic']))
        conn.commit()

        actor_id = get_id_by_href('javdb_actors', actor['href'])
        if actor_id is None:
            logging.warning(f"insert data error. name: {actor['name']}, href: {actor['href']}")
            return None

        logging.debug(f"insert one actor, id: {actor_id}, name: {actor['name']}, href: {actor['href']}")

        # Insert aliases
        for alias in actor.get("alias") or []:
            cursor.execute('''
                INSERT OR IGNORE INTO javdb_actors_alias (actor_id, alias, updated_at)
                VALUES (?, ?, datetime('now', 'localtime'))
            ''', (actor_id, alias))

        conn.commit()

        # Insert the movie list
        for movie in actor.get("credits") or []:
            movie_id = get_id_by_href('javdb_movies', movie['href'])
            # If the movie does not exist yet, insert it first
            if movie_id is None:
                movie_id = insert_movie_index(movie['title'], movie['href'])
            if movie_id:
                tmp_id = insert_actor_movie(actor_id, movie_id)
                if tmp_id:
                    logging.debug(f'insert one performer_movie, performer_id: {actor_id}, movie_id: {movie_id}')
                else:
                    logging.warning(f"insert performer_movie failed. performer_id: {actor_id}, movie href: {movie['href']}")

        return actor_id
    except Exception as e:
        logging.error(f"Insert/update actor {actor['name']} failed: {e}")
        conn.rollback()
        return None

# Delete an actor
def delete_actor_by_href(href):
    try:
        cursor.execute('DELETE FROM javdb_actors WHERE href = ?', (href,))
        conn.commit()
        logging.info(f"Deleted actor: {href}")
    except Exception as e:
        logging.error(f"Deleting actor {href} failed: {e}")
        conn.rollback()

# Query actors by optional filters
def query_actors(**filters):
    try:
        sql = "SELECT href, name FROM javdb_actors WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "href" in filters:
            sql += " AND href = ?"
            params.append(filters["href"])
        if "name" in filters:
            sql += " AND name LIKE ?"
            params.append(f"%{filters['name']}%")
        if "is_full_data" in filters:
            sql += " AND is_full_data = ?"
            params.append(filters["is_full_data"])
        if 'limit' in filters:
            sql += " LIMIT ?"
            params.append(filters["limit"])

        cursor.execute(sql, params)
        #return [row[0].lower() for row in cursor.fetchall()]  # lowercase hrefs
        return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]

    except sqlite3.Error as e:
        logging.error(f"Querying hrefs failed: {e}")
        return None

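# e.g. query_actors(is_full_data=0, limit=10) ->
#   [{'href': 'https://...', 'name': '...'}, ...]
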
# Insert or update a maker
def insert_or_update_makers(data):
    try:
        cursor.execute("""
            INSERT INTO javdb_makers (name, href, updated_at)
            VALUES (?, ?, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                name = excluded.name,
                updated_at = datetime('now', 'localtime')
        """, (data["name"], data["href"]))
        conn.commit()

        # Fetch the maker id
        cursor.execute("SELECT id FROM javdb_makers WHERE href = ?", (data["href"],))
        maker_id = cursor.fetchone()[0]
        if maker_id:
            logging.debug(f"Inserted/updated maker: {data['name']}")
            return maker_id
        else:
            return None
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Database error: {e}")
        return None

# Delete a maker (by id or name)
def delete_maker(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("DELETE FROM javdb_makers WHERE id = ?", (identifier,))
        elif isinstance(identifier, str):
            cursor.execute("DELETE FROM javdb_makers WHERE name = ?", (identifier,))
        conn.commit()
        logging.info(f"Deleted maker: {identifier}")
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Delete failed: {e}")

# Query a maker (by id or name)
def query_maker(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("SELECT * FROM javdb_makers WHERE id = ?", (identifier,))
        else:
            cursor.execute("SELECT * FROM javdb_makers WHERE name LIKE ?", (f"%{identifier}%",))

        maker = cursor.fetchone()
        if maker:
            return dict(zip([desc[0] for desc in cursor.description], maker))
        else:
            logging.warning(f"Maker not found: {identifier}")
            return None
    except sqlite3.Error as e:
        logging.error(f"Query failed: {e}")
        return None

# Query a list of maker hrefs by optional filters
def query_maker_hrefs(**filters):
    try:
        sql = "SELECT href FROM javdb_makers WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "href" in filters:
            sql += " AND href = ?"
            params.append(filters["href"])
        if "name" in filters:
            sql += " AND name LIKE ?"
            params.append(f"%{filters['name']}%")

        cursor.execute(sql, params)
        return [row[0] for row in cursor.fetchall()]  # hrefs are stored lowercase

    except sqlite3.Error as e:
        logging.error(f"Querying hrefs failed: {e}")
        return None

# """ 插入或更新制作公司 """
|
||||||
|
def insert_or_update_series(data):
|
||||||
|
try:
|
||||||
|
cursor.execute("""
|
||||||
|
INSERT INTO javdb_series (name, href, updated_at)
|
||||||
|
VALUES (?, ?, datetime('now', 'localtime'))
|
||||||
|
ON CONFLICT(href) DO UPDATE SET
|
||||||
|
name = excluded.name,
|
||||||
|
updated_at = datetime('now', 'localtime')
|
||||||
|
""", (data["name"], data["href"]))
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
# 获取 performer_id
|
||||||
|
cursor.execute("SELECT id FROM javdb_series WHERE href = ?", (data["href"],))
|
||||||
|
stu_id = cursor.fetchone()[0]
|
||||||
|
if stu_id:
|
||||||
|
logging.debug(f"成功插入/更新发行商: {data['name']}")
|
||||||
|
return stu_id
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
conn.rollback()
|
||||||
|
logging.error(f"数据库错误: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# """ 删除制作公司(按 id 或 name) """
|
||||||
|
def delete_series(identifier):
|
||||||
|
try:
|
||||||
|
if isinstance(identifier, int):
|
||||||
|
cursor.execute("DELETE FROM javdb_series WHERE id = ?", (identifier,))
|
||||||
|
elif isinstance(identifier, str):
|
||||||
|
cursor.execute("DELETE FROM javdb_series WHERE name = ?", (identifier,))
|
||||||
|
conn.commit()
|
||||||
|
logging.info(f"成功删除制作公司: {identifier}")
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
conn.rollback()
|
||||||
|
logging.error(f"删除失败: {e}")
|
||||||
|
|
||||||
|
# """ 查询制作公司(按 id 或 name) """
|
||||||
|
def query_series(identifier):
|
||||||
|
try:
|
||||||
|
if isinstance(identifier, int):
|
||||||
|
cursor.execute("SELECT * FROM javdb_series WHERE id = ?", (identifier,))
|
||||||
|
else:
|
||||||
|
cursor.execute("SELECT * FROM javdb_series WHERE name LIKE ?", (f"%{identifier}%",))
|
||||||
|
|
||||||
|
studio = cursor.fetchone()
|
||||||
|
if studio:
|
||||||
|
return dict(zip([desc[0] for desc in cursor.description], studio))
|
||||||
|
else:
|
||||||
|
logging.warning(f"未找到制作公司: {identifier}")
|
||||||
|
return None
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
logging.error(f"查询失败: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Query a list of series hrefs by optional filters
def query_series_hrefs(**filters):
    try:
        sql = "SELECT href FROM javdb_series WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "href" in filters:
            sql += " AND href = ?"
            params.append(filters["href"])
        if "name" in filters:
            sql += " AND name LIKE ?"
            params.append(f"%{filters['name']}%")

        cursor.execute(sql, params)
        return [row[0] for row in cursor.fetchall()]  # hrefs are stored lowercase

    except sqlite3.Error as e:
        logging.error(f"Querying hrefs failed: {e}")
        return None


# """插入或更新电影数据"""
|
||||||
|
def insert_or_update_movie(movie):
|
||||||
|
try:
|
||||||
|
# 获取相关 ID
|
||||||
|
makers_id = get_id_by_href('javdb_makers', movie['maker_link'])
|
||||||
|
series_id = get_id_by_href('javdb_series', movie['series_link'])
|
||||||
|
|
||||||
|
|
||||||
|
cursor.execute("""
|
||||||
|
INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration,
|
||||||
|
maker_id, series_id, is_full_data, updated_at)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
|
||||||
|
ON CONFLICT(href) DO UPDATE SET
|
||||||
|
title=excluded.title,
|
||||||
|
cover_url=excluded.cover_url,
|
||||||
|
serial_number=excluded.serial_number,
|
||||||
|
release_date=excluded.release_date,
|
||||||
|
duration=excluded.duration,
|
||||||
|
maker_id=excluded.maker_id,
|
||||||
|
series_id=excluded.series_id,
|
||||||
|
is_full_data=1,
|
||||||
|
updated_at=datetime('now', 'localtime')
|
||||||
|
""", (movie['href'], movie['title'], movie['cover_url'], movie['serial_number'],
|
||||||
|
movie['release_date'], movie['duration'], makers_id, series_id))
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
# 获取插入的 movie_id
|
||||||
|
movie_id = get_id_by_href('javdb_movies', movie['href'])
|
||||||
|
if movie_id is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
logging.debug(f'insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}')
|
||||||
|
|
||||||
|
# 插入 performers_movies 关系表
|
||||||
|
for performer in movie.get('actors', []):
|
||||||
|
performer_id = get_id_by_href('javdb_actors', performer['href'])
|
||||||
|
# 如果演员不存在,先插入
|
||||||
|
if performer_id is None:
|
||||||
|
performer_id = insert_actor_index(performer['name'], performer['href'])
|
||||||
|
if performer_id:
|
||||||
|
tmp_id = insert_actor_movie(performer_id, movie_id)
|
||||||
|
if tmp_id:
|
||||||
|
logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}")
|
||||||
|
else:
|
||||||
|
logging.debug(f'insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}')
|
||||||
|
else:
|
||||||
|
logging.warning(f'insert perfomer failed. name: {performer['name']}, href: {performer['href']}')
|
||||||
|
|
||||||
|
return movie_id
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
conn.rollback()
|
||||||
|
logging.error("Error inserting movie: %s", e)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Delete a movie record
def delete_movie(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("DELETE FROM javdb_movies WHERE id = ?", (identifier,))
        elif isinstance(identifier, str):
            cursor.execute("DELETE FROM javdb_movies WHERE href = ?", (identifier,))
        else:
            logging.warning("Invalid delete argument")
            return
        conn.commit()
        logging.info(f"Deleted movie with {identifier}")

    except sqlite3.Error as e:
        conn.rollback()
        logging.error("Error deleting movie: %s", e)

# Look up a movie record
def query_movies(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("SELECT * FROM javdb_movies WHERE id = ?", (identifier,))
        elif "http" in identifier:
            cursor.execute("SELECT * FROM javdb_movies WHERE href = ?", (identifier,))
        else:
            cursor.execute("SELECT * FROM javdb_movies WHERE title LIKE ?", (f"%{identifier}%",))

        movie = cursor.fetchone()
        if movie:
            # Build the movie dict before reusing the cursor for the relation query
            result = dict(zip([desc[0] for desc in cursor.description], movie))
            cursor.execute("SELECT actor_id FROM javdb_actors_movies WHERE movie_id = ?", (movie[0],))
            performers = [row[0] for row in cursor.fetchall()]
            result["performers"] = performers
            return result
        else:
            logging.warning(f"find no data: {identifier}")
            return None

    except sqlite3.Error as e:
        logging.error(f"Query failed: {e}")
        return None

# Query a list of movie hrefs by optional filters
def query_movie_hrefs(**filters):
    try:
        sql = "SELECT href, title FROM javdb_movies WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "href" in filters:
            sql += " AND href = ?"
            params.append(filters["href"])
        if "title" in filters:
            sql += " AND title LIKE ?"
            params.append(f"%{filters['title']}%")
        if "is_full_data" in filters:
            sql += " AND is_full_data = ?"
            params.append(filters["is_full_data"])
        if 'limit' in filters:
            sql += " LIMIT ?"
            params.append(filters["limit"])

        cursor.execute(sql, params)
        #return [row[0].lower() for row in cursor.fetchall()]  # lowercase hrefs
        return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]

    except sqlite3.Error as e:
        logging.error(f"Querying hrefs failed: {e}")
        return []

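# e.g. query_movie_hrefs(is_full_data=0, limit=10) ->
#   [{'href': 'https://...', 'title': '...'}, ...]
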
# Insert a task log entry
def insert_task_log():
    try:
        cursor.execute("""
            INSERT INTO javdb_task_log (task_status) VALUES ('Start')
        """)
        conn.commit()
        return cursor.lastrowid  # id of the inserted task
    except sqlite3.Error as e:
        logging.error(f"Inserting task failed: {e}")
        return None

# Update arbitrary fields on a task log row
def update_task_log_inner(task_id, **kwargs):
    try:
        fields = ", ".join(f"{key} = ?" for key in kwargs.keys())
        params = list(kwargs.values()) + [task_id]

        sql = f"UPDATE javdb_task_log SET {fields}, updated_at = datetime('now', 'localtime') WHERE task_id = ?"
        cursor.execute(sql, params)
        conn.commit()
    except sqlite3.Error as e:
        logging.error(f"Updating task {task_id} failed: {e}")

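# e.g. update_task_log_inner(task_id, task_status='Running') executes:
#   UPDATE javdb_task_log SET task_status = ?, updated_at = datetime('now', 'localtime') WHERE task_id = ?
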
# Refresh the row counters and status on a task log row
def update_task_log(task_id, task_status):
    try:
        # Final row counts for the actors, movies, makers, and series tables
        cursor.execute("SELECT COUNT(*) FROM javdb_actors WHERE is_full_data=1")
        full_data_actors = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM javdb_actors")
        total_actors = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM javdb_movies WHERE is_full_data=1")
        full_data_movies = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM javdb_movies")
        total_movies = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM javdb_makers")
        total_makers = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM javdb_series")
        total_series = cursor.fetchone()[0]

        # Update the task_log row
        update_task_log_inner(task_id,
                              full_data_actors=full_data_actors,
                              total_actors=total_actors,
                              full_data_movies=full_data_movies,
                              total_movies=total_movies,
                              total_makers=total_makers,
                              total_series=total_series,
                              task_status=task_status)

    except sqlite3.Error as e:
        logging.error(f"Updating task {task_id} failed: {e}")


# Finish the task and update its fields
def finalize_task_log(task_id):
    try:
        update_task_log(task_id, task_status="Success")
    except sqlite3.Error as e:
        logging.error(f"Finalizing task {task_id} failed: {e}")


# Test code
if __name__ == "__main__":

    sample_data = [
        {
            'name': '上原亜衣',
            'href': 'https://www.javdb.com/actors/MkAX',
            'pic': 'https://c0.jdbstatic.com/avatars/mk/MkAX.jpg',
            'alias': ['上原亜衣', '下原舞', '早瀬クリスタル', '阿蘇山百式屏風奉行']
        },
        {
            'name': '大橋未久',
            'href': 'https://www.javdb.com/actors/21Jp',
            'pic': 'https://c0.jdbstatic.com/avatars/21/21Jp.jpg',
            'alias': ['大橋未久']
        },
    ]

    for actor in sample_data:
        insert_or_update_actor(actor)

    print(query_actors(name='未久'))
    #delete_actor_by_href('https://www.javdb.com/actors/MkAX')
    print(query_actors())
0 scripts/javdb/src/utils.py Normal file