modify some scripts.
This commit is contained in:
@ -16,9 +16,7 @@ debug = False
|
||||
force = False
|
||||
|
||||
# 按星座获取演员列表,无翻页
|
||||
def fetch_performers_by_astro(existed_performer_hrefs):
|
||||
performers = []
|
||||
|
||||
def fetch_performers_by_astro():
|
||||
for astro in scraper.astro_list:
|
||||
url = scraper.astr_base_url + astro
|
||||
logging.info(f"Fetching data for {astro}, url {url} ...")
|
||||
@ -28,11 +26,13 @@ def fetch_performers_by_astro(existed_performer_hrefs):
|
||||
list_data, next_url = scraper.parse_page_astro(soup, astro)
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
if row['href'] not in existed_performer_hrefs:
|
||||
performers.append({
|
||||
'person' : row['person'],
|
||||
'href' : row['href'].lower() if row['href'] else ''
|
||||
})
|
||||
# 写入演员数据表
|
||||
perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row['href'].lower() if row['href'] else '')
|
||||
if perfomer_id:
|
||||
logging.debug(f'insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}')
|
||||
else:
|
||||
logging.warning(f'insert performer index failed. name: {row['person']}, href:{row['href']}')
|
||||
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
else:
|
||||
@ -41,13 +41,10 @@ def fetch_performers_by_astro(existed_performer_hrefs):
|
||||
# 调试添加break
|
||||
if debug:
|
||||
break
|
||||
return performers
|
||||
|
||||
|
||||
# 按生日获取演员列表,无翻页
|
||||
def fetch_performers_by_birth(existed_performer_hrefs):
|
||||
performers = []
|
||||
|
||||
def fetch_performers_by_birth():
|
||||
for month in range(1, 13): # 遍历1到12月
|
||||
for day in range(1, 32): # 遍历1到31天
|
||||
url = scraper.birth_base_url.format(month=month, day=day)
|
||||
@ -57,11 +54,12 @@ def fetch_performers_by_birth(existed_performer_hrefs):
|
||||
list_data, next_url = scraper.parse_page_birth(soup, month, day)
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
if row['href'] not in existed_performer_hrefs:
|
||||
performers.append({
|
||||
'person' : row['person'],
|
||||
'href' : row['href'].lower() if row['href'] else ''
|
||||
})
|
||||
# 写入演员数据表
|
||||
perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row['href'].lower() if row['href'] else '')
|
||||
if perfomer_id:
|
||||
logging.debug(f'insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}')
|
||||
else:
|
||||
logging.warning(f'insert performer index failed. name: {row['person']}, href:{row['href']}')
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
else:
|
||||
@ -69,18 +67,14 @@ def fetch_performers_by_birth(existed_performer_hrefs):
|
||||
|
||||
# 调试添加break
|
||||
if debug:
|
||||
return performers
|
||||
|
||||
return performers
|
||||
return True
|
||||
|
||||
# 处理带空格的种族名
|
||||
def format_ethnic(ethnic):
    """Return *ethnic* with every space character replaced by ``+``.

    The result is appended to the ethnicity listing URL by the caller.
    """
    # Splitting on an explicit ' ' keeps empty pieces, so runs of spaces
    # become runs of '+' exactly like a plain character replacement.
    return '+'.join(ethnic.split(' '))
|
||||
|
||||
# 按人种获取演员列表,有翻页
|
||||
def fetch_performers_by_ethnic(existed_performer_hrefs):
|
||||
performers = []
|
||||
|
||||
def fetch_performers_by_ethnic():
|
||||
for ethnic in scraper.ethnic_list:
|
||||
url = scraper.ethnic_url + format_ethnic(ethnic)
|
||||
next_url = url
|
||||
@ -93,11 +87,12 @@ def fetch_performers_by_ethnic(existed_performer_hrefs):
|
||||
list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
if row['href'] not in existed_performer_hrefs:
|
||||
performers.append({
|
||||
'person' : row['person'],
|
||||
'href' : row['href'].lower() if row['href'] else ''
|
||||
})
|
||||
# 写入演员数据表
|
||||
perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row['href'].lower() if row['href'] else '')
|
||||
if perfomer_id:
|
||||
logging.debug(f'insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}')
|
||||
else:
|
||||
logging.warning(f'insert performer index failed. name: {row['person']}, href:{row['href']}')
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
else:
|
||||
@ -105,15 +100,11 @@ def fetch_performers_by_ethnic(existed_performer_hrefs):
|
||||
|
||||
# 调试添加break
|
||||
if debug:
|
||||
return performers
|
||||
return performers
|
||||
|
||||
return True
|
||||
|
||||
# 获取distributors列表
|
||||
def fetch_distributors_list(existed_distributors_href):
|
||||
def fetch_distributors_list():
|
||||
url = scraper.distributors_list_url
|
||||
distributors_list = []
|
||||
|
||||
logging.info(f"Fetching data for distributors list, url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
|
||||
if soup:
|
||||
@ -121,23 +112,17 @@ def fetch_distributors_list(existed_distributors_href):
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
dis_url = scraper.distributors_base_url + row['href']
|
||||
if dis_url in existed_distributors_href :
|
||||
continue
|
||||
distributors_list.append({
|
||||
'name' : row['name'],
|
||||
'href' : dis_url.lower() if dis_url else ''
|
||||
})
|
||||
dist_id = db_tools.insert_or_update_distributor({'name': row['name'], 'href': dis_url})
|
||||
if dist_id:
|
||||
logging.debug(f'insert one record into distributors table. id:{dist_id}, name: {row['name']}, href:{dis_url}')
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
return distributors_list
|
||||
|
||||
# 获取studios列表
|
||||
def fetch_studios_list(existed_studios_href):
|
||||
def fetch_studios_list():
|
||||
url = scraper.studios_list_url
|
||||
studios_list = []
|
||||
|
||||
logging.info(f"Fetching data for studios list, url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
|
||||
if soup:
|
||||
@ -145,205 +130,189 @@ def fetch_studios_list(existed_studios_href):
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
stu_url = scraper.studios_base_url + row['href']
|
||||
if stu_url in existed_studios_href:
|
||||
continue
|
||||
studios_list.append({
|
||||
'name' : row['name'],
|
||||
'href' : stu_url.lower() if stu_url else ''
|
||||
})
|
||||
stu_id = db_tools.insert_or_update_studio({'name': row['name'], 'href': stu_url})
|
||||
if stu_id:
|
||||
logging.debug(f'insert one record into studios table. id:{stu_id}, name: {row['name']}, href:{stu_url}')
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
return studios_list
|
||||
|
||||
|
||||
# 更新distributors列表中的影片信息
|
||||
def fetch_movies_by_dist():
    """Insert a movie index row for every movie listed on each distributor page.

    The distributor URLs come from the database. In debug mode only the
    distributors matching ``vixen.com`` are queried and the loop stops after
    the first URL.

    Returns:
        None. Results are written through ``db_tools.insert_movie_index``.
    """
    # The distributor pages (table id "distable") must be driven by the
    # distributor hrefs, not by the studio hrefs.
    if debug:
        url_list = db_tools.query_distributor_hrefs(name='vixen.com')
    else:
        url_list = db_tools.query_distributor_hrefs()

    # Guard against a failed query (None) as well as an empty table, so the
    # loop below never tries to iterate over None.
    if not url_list:
        logging.warning('get distributor hrefs from db failed or returned nothing.')
        return

    validator = partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id")
    for url in url_list:
        logging.info(f"Fetching data for distributor url {url} ...")
        soup = scraper.fetch_page(url, validator)
        if soup:
            # The second return value (next page url) is not used here.
            list_data, _next_url = scraper.parse_page_dist_stu(soup, 'distable')
            if list_data:
                for movie in list_data:
                    tmp_id = db_tools.insert_movie_index(
                        title=movie['title'],
                        href=movie['href'],
                        release_year=utils.to_number(movie['year']),
                    )
                    # Double quotes inside the f-strings keep them valid on
                    # Python versions before 3.12; the output is unchanged.
                    if tmp_id:
                        logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                    else:
                        logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
            else:
                logging.warning(f'parse_page_movie error. url: {url}')
        else:
            # Make a failed page fetch visible instead of skipping it silently.
            logging.warning(f'fetch_page error. url: {url}')

        # Debug runs process a single url only.
        if debug:
            break
|
||||
|
||||
# Update the movie index from every studio listing page
|
||||
def fetch_movies_by_stu():
    """Insert a movie index row for every movie listed on each studio page.

    The studio URLs come from the database. In debug mode only the studios
    matching ``vixen.com`` are queried and the loop stops after the first URL.

    Returns:
        None. Results are written through ``db_tools.insert_movie_index``.
    """
    if debug:
        url_list = db_tools.query_studio_hrefs(name='vixen.com')
    else:
        url_list = db_tools.query_studio_hrefs()

    # query_studio_hrefs returns None when the query fails; iterating over
    # None would raise TypeError, so stop early in that case.
    if not url_list:
        logging.warning('get studio hrefs from db failed or returned nothing.')
        return

    validator = partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id")
    for url in url_list:
        logging.info(f"Fetching data for studio url {url} ...")
        soup = scraper.fetch_page(url, validator)
        if soup:
            # The second return value (next page url) is not used here.
            list_data, _next_url = scraper.parse_page_dist_stu(soup, 'studio')
            if list_data:
                for movie in list_data:
                    tmp_id = db_tools.insert_movie_index(
                        title=movie['title'],
                        href=movie['href'],
                        release_year=utils.to_number(movie['year']),
                    )
                    # Double quotes inside the f-strings keep them valid on
                    # Python versions before 3.12; the output is unchanged.
                    if tmp_id:
                        logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                    else:
                        logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
            else:
                logging.warning(f'parse_page_movie error. url: {url}')
        else:
            # Make a failed page fetch visible instead of skipping it silently.
            logging.warning(f'fetch_page error. url: {url}')

        # Debug runs process a single url only.
        if debug:
            break
|
||||
|
||||
# 更新演员信息
|
||||
def fetch_performers_detail():
    """Fetch and store the detail page of every performer with ``is_full_data=0``.

    Performers are read from the database in batches of 1000 so the whole
    table is never loaded at once. Each performer page is fetched, parsed,
    written to the database and dumped to a local json file.

    Returns:
        True when running in debug mode (after the first performer),
        otherwise None.
    """
    # hrefs already tried in this run. A performer whose fetch, parse or
    # insert failed keeps is_full_data=0 and would otherwise be returned by
    # every following query, so the loop would never finish.
    attempted = set()
    validator = partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id")

    while True:
        # Take only a slice per round to avoid loading everything at once.
        perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=1000)
        if perfomers_list is None:
            logging.warning('query performers from db failed, stop fetching performer details.')
            break
        if len(perfomers_list) < 1:
            logging.info('all performers fetched.')
            break

        pending = [item for item in perfomers_list if item['href'] not in attempted]
        if not pending:
            # Every row in this batch already failed once during this run.
            logging.warning(f'{len(perfomers_list)} performers could not be completed, stop retrying.')
            break

        for performer in pending:
            url = performer['href']
            person = performer['name']
            attempted.add(url)
            logging.info(f"Fetching data for performer ({person}), url {url} ...")
            soup = scraper.fetch_page(url, validator)
            if soup:
                data = scraper.parse_page_performer(soup)
                if data:
                    record = {
                        'href': url,
                        'person': person,
                        **data
                    }
                    performer_id = db_tools.insert_or_update_performer(record)
                    if performer_id:
                        logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
                    else:
                        logging.warning(f'insert person: ({person}) {url} failed.')

                    # Keep a local json copy of the parsed page.
                    utils.write_person_json(person, url, record)
                else:
                    logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
            else:
                logging.warning(f'fetch_page error. person: ({person}), url: {url}')

            # Debug runs stop after the first performer.
            if debug:
                return True
|
||||
|
||||
# 更新影片信息
|
||||
def fetch_movies_detail():
    """Fetch and store the detail page of every movie with ``is_full_data=0``.

    Movies are read from the database in batches of 1000. Each movie page is
    fetched, parsed, written to the database and dumped to a local json file.

    Returns:
        True when running in debug mode (after the first movie),
        otherwise None.
    """
    # hrefs already tried in this run. A movie whose fetch, parse or insert
    # failed keeps is_full_data=0 and would otherwise be returned by every
    # following query, so the loop would never finish.
    attempted = set()
    validator = partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class")

    while True:
        movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=1000)
        if movies_list is None:
            logging.warning('query movies from db failed, stop fetching movie details.')
            break
        if len(movies_list) < 1:
            logging.info('all movies fetched.')
            break

        pending = [item for item in movies_list if item['href'] not in attempted]
        if not pending:
            # Every row in this batch already failed once during this run.
            logging.warning(f'{len(movies_list)} movies could not be completed, stop retrying.')
            break

        for movie in pending:
            url = movie['href']
            title = movie['title']
            attempted.add(url)
            logging.info(f"Fetching data for movie ({title}), url {url} ...")
            soup = scraper.fetch_page(url, validator)
            if soup:
                movie_data = scraper.parse_page_movie(soup, url, title)
                if movie_data:
                    # Normalise distributor / studio urls before storing them.
                    if movie_data['DistributorHref']:
                        movie_data['DistributorHref'] = utils.dist_stu_href_rewrite(movie_data['DistributorHref'].lower())
                    if movie_data['StudioHref']:
                        movie_data['StudioHref'] = utils.dist_stu_href_rewrite(movie_data['StudioHref'].lower())

                    movie_id = db_tools.insert_or_update_movie(movie_data)
                    if movie_id:
                        logging.info(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
                    else:
                        logging.warning(f'insert movie {url} failed.')

                    # Keep a local json copy of the parsed page.
                    utils.write_movie_json(url, movie_data)
                else:
                    logging.warning(f'parse_page_movie error. url: {url}')
            else:
                logging.warning(f'fetch_page error. url: {url}')

            # Debug runs stop after the first movie.
            if debug:
                return True
|
||||
|
||||
# 获取更新
|
||||
def check_update():
|
||||
# 读取数据库中的演员列表
|
||||
existed_performer_hrefs = db_tools.query_performer_hrefs()
|
||||
if not existed_performer_hrefs:
|
||||
logging.warning(f'get existed performers from db error.')
|
||||
return None
|
||||
|
||||
|
||||
# 开启任务
|
||||
task_id = db_tools.insert_task_log()
|
||||
if task_id is None:
|
||||
logging.warning(f'insert task log error.')
|
||||
return None
|
||||
|
||||
# 从列表页获取新的演员
|
||||
new_performers = []
|
||||
if not debug : # 数据量较大,debug 模式下跳过
|
||||
new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
|
||||
new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
|
||||
new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))
|
||||
# 刷新星座演员列表
|
||||
db_tools.update_task_log(task_id, task_status='fetching astro list')
|
||||
fetch_performers_by_astro()
|
||||
|
||||
# 逐个获取演员信息,并写入到db中
|
||||
new_performers = list({item["href"]: item for item in new_performers}.values())
|
||||
logging.info(f'get new performers count: {len(new_performers)} ')
|
||||
db_tools.update_task_log(task_id, before_performers=len(existed_performer_hrefs), new_performers=len(new_performers), task_status='Inserting new performers')
|
||||
for performer in new_performers:
|
||||
url = performer['href']
|
||||
person = performer['person']
|
||||
logging.info(f"Fetching data for performer {person}, url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
|
||||
if soup:
|
||||
data, credits = scraper.parse_page_performer(soup)
|
||||
if data:
|
||||
performer_id = db_tools.insert_or_update_performer({
|
||||
'href': url,
|
||||
'person': person,
|
||||
**data
|
||||
})
|
||||
if performer_id:
|
||||
logging.info(f'insert one person, id: {performer_id}, person: {person}, url: {url}')
|
||||
else:
|
||||
logging.warning(f'insert person: {person} {url} failed.')
|
||||
# 刷新生日演员列表
|
||||
db_tools.update_task_log(task_id, task_status='fetching birth list')
|
||||
fetch_performers_by_birth()
|
||||
|
||||
# 写入到本地json文件
|
||||
utils.write_person_json(person, url, {
|
||||
'href': url,
|
||||
'person': person,
|
||||
**data,
|
||||
'credits': credits if credits else {}
|
||||
})
|
||||
else:
|
||||
logging.warning(f'parse_page_performer error. person: {person}, url: {url}')
|
||||
else:
|
||||
logging.warning(f'fetch_page error. person: {person}, url: {url}')
|
||||
# 调试break
|
||||
if debug:
|
||||
break
|
||||
# 刷新人种演员列表
|
||||
db_tools.update_task_log(task_id, task_status='fetching ethnic list')
|
||||
fetch_performers_by_ethnic()
|
||||
|
||||
# 从数据库读取distributors列表
|
||||
existed_distributors_href = db_tools.query_distributor_hrefs()
|
||||
if existed_distributors_href is None:
|
||||
logging.warning(f'get existed distributors from db error.')
|
||||
return
|
||||
new_distributors = fetch_distributors_list(existed_distributors_href)
|
||||
db_tools.update_task_log(task_id, before_distributors=len(existed_distributors_href), new_distributors=len(new_distributors), task_status='Inserting new distributors')
|
||||
for dist in new_distributors:
|
||||
dist_id = db_tools.insert_or_update_distributor(dist)
|
||||
if dist_id:
|
||||
logging.info(f'insert one distributor record, id: {dist_id}, name: {dist['name']}, href: {dist['href']}')
|
||||
else:
|
||||
logging.warning(f'insert into studio failed. name: {dist['name']} href: {dist['href']}')
|
||||
|
||||
# 从数据库读取studios列表
|
||||
existed_studios_href = db_tools.query_studio_hrefs()
|
||||
if existed_studios_href is None:
|
||||
logging.warning(f'get existed studios from db error.')
|
||||
return
|
||||
new_studios = fetch_studios_list(existed_studios_href)
|
||||
db_tools.update_task_log(task_id, before_studios=len(existed_studios_href), new_studios=len(new_studios), task_status='Inserting new studios')
|
||||
for stu in new_studios:
|
||||
stu_id = db_tools.insert_or_update_studio(stu)
|
||||
if stu_id:
|
||||
logging.info(f'insert one studio record, id: {stu_id}, name: {stu['name']}, href: {stu['href']}')
|
||||
else:
|
||||
logging.warning(f'insert into studio failed. name: {stu['name']}, href: {stu['href']}')
|
||||
|
||||
# 从数据库中读取影片列表
|
||||
existed_movies = db_tools.query_movie_hrefs()
|
||||
if existed_movies is None:
|
||||
logging.warning(f'load movies from db error')
|
||||
return
|
||||
new_movies = []
|
||||
new_movie_hrefs = []
|
||||
# 刷新distributors列表
|
||||
db_tools.update_task_log(task_id, task_status='fetching distributor list')
|
||||
fetch_distributors_list()
|
||||
|
||||
# 遍历所有 distributors,获取 movies 列表
|
||||
existed_distributors_href = db_tools.query_distributor_hrefs(name='vixen')
|
||||
if existed_distributors_href is None:
|
||||
logging.warning(f'get existed distributors from db error.')
|
||||
return
|
||||
for url in existed_distributors_href:
|
||||
logging.info(f"Fetching data for distributor url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
|
||||
if list_data:
|
||||
for movie in list_data:
|
||||
if movie['href'] in existed_movies:
|
||||
continue
|
||||
new_movies.append({
|
||||
'title' : movie['title'],
|
||||
'href' : movie['href']
|
||||
})
|
||||
new_movie_hrefs.append(movie['href'])
|
||||
else :
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
# 调试增加brak
|
||||
if debug:
|
||||
break
|
||||
logging.info(f'all new moives found for distributors, now total new {len(new_movies)}')
|
||||
|
||||
# 遍历所有 studios,获取 movies 列表
|
||||
existed_studios_href = db_tools.query_studio_hrefs(name='vixen')
|
||||
if existed_studios_href is None:
|
||||
logging.warning(f'get existed studios from db error.')
|
||||
return
|
||||
for url in existed_studios_href:
|
||||
logging.info(f"Fetching data for studio url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
|
||||
if list_data:
|
||||
for movie in list_data:
|
||||
if movie['href'] in existed_movies and movie['href'] in new_movie_hrefs:
|
||||
continue
|
||||
new_movies.append({
|
||||
'title' : movie['title'],
|
||||
'href' : movie['href']
|
||||
})
|
||||
new_movie_hrefs.append(movie['href'])
|
||||
else :
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
# 调试增加brak
|
||||
if debug:
|
||||
break
|
||||
logging.info(f'all new moives found for studios, now total new {len(new_movies)}')
|
||||
# 刷新studios列表
|
||||
db_tools.update_task_log(task_id, task_status='fetching studio list')
|
||||
fetch_studios_list()
|
||||
|
||||
# 对新的影片,逐个获取内容
|
||||
new_movies = list({item["href"]: item for item in new_movies}.values())
|
||||
logging.info(f'get merged new movies, count: {len(new_movies)} ')
|
||||
db_tools.update_task_log(task_id, before_movies=len(existed_movies), new_movies=len(new_movies), task_status='Inserting new movies')
|
||||
for movie in new_movies:
|
||||
url = movie['href']
|
||||
title = movie['title']
|
||||
logging.info(f"Fetching data for movie {title}, url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
|
||||
if soup:
|
||||
movie_data = scraper.parse_page_movie(soup, url, title)
|
||||
if movie_data :
|
||||
# 修复url不规范的问题
|
||||
if movie_data['DistributorHref']:
|
||||
movie_data['DistributorHref'] = utils.dist_stu_href_rewrite(movie_data['DistributorHref'].lower())
|
||||
if movie_data['StudioHref']:
|
||||
movie_data['StudioHref'] = utils.dist_stu_href_rewrite(movie_data['StudioHref'].lower())
|
||||
movie_id = db_tools.insert_or_update_movie(movie_data)
|
||||
if movie_id:
|
||||
logging.info(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
|
||||
else:
|
||||
logging.warning(f'insert movie {url} failed.')
|
||||
# 刷新影片列表
|
||||
db_tools.update_task_log(task_id, task_status='fetching movie list by dist')
|
||||
fetch_movies_by_dist()
|
||||
db_tools.update_task_log(task_id, task_status='fetching movie list by stu')
|
||||
fetch_movies_by_stu()
|
||||
|
||||
# 写入到本地json文件
|
||||
utils.write_movie_json(url, movie_data)
|
||||
else:
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
else:
|
||||
logging.warning(f'fetch_page error. url: {url}')
|
||||
# 调试增加break
|
||||
if debug:
|
||||
break
|
||||
# 更新演员信息
|
||||
db_tools.update_task_log(task_id, task_status='fetching performers')
|
||||
fetch_performers_detail()
|
||||
|
||||
# TODO:
|
||||
# 1, appearsIn 因为影片入库的先后顺序不可控,会出现无法插入 movies_appers_in 表的情况,应该要先记录下待处理的movie,所有记录插入完成后再做处理
|
||||
# 2, movie 的更新,涉及到performers的几个统计字段的更新,应该要找到本次tasklog启动后插入到 performers_movies 表里的所有performers,刷新其统计数据;也可以简单粗暴的全量更新
|
||||
# 3, 目前performers_movies以movies爬取的信息为主来更新,perfomers爬取的信息应该可以作为检验,尤其是perfomers页面有notes字段
|
||||
# 更新影片信息
|
||||
db_tools.update_task_log(task_id, task_status='fetching movies')
|
||||
fetch_movies_detail()
|
||||
|
||||
logging.info(f'all process completed!')
|
||||
db_tools.finalize_task_log(task_id)
|
||||
|
||||
# TODO:
|
||||
# 1, movies 更新之后,要给相应的 performers 表打个 is_full_data = 0, 然后刷新获取
|
||||
# 2, distributors 和 studios 对movie列表的互相检验
|
||||
# 3, 数据不规范问题,可以先手动导入所有 performers 和 movies ,然后用本程序增量获取新的
|
||||
|
||||
# 处理本地数据
|
||||
def load_data():
|
||||
|
||||
@ -40,6 +40,10 @@ scraper = cloudscraper.create_scraper()
|
||||
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
if host_url not in url.lower():
|
||||
logging.error(f'wrong url format: {url}')
|
||||
return None
|
||||
|
||||
response = scraper.get(url, headers=headers)
|
||||
response.raise_for_status() # 处理 HTTP 错误
|
||||
|
||||
@ -267,6 +271,8 @@ def parse_credits_table(table, distributor_list):
|
||||
cols = row.find_all('td')
|
||||
if len(cols) >= 6:
|
||||
title = cols[0].text.strip()
|
||||
href_a = cols[0].find('a')
|
||||
href = href_a['href'] if href_a else ''
|
||||
year = cols[1].text.strip()
|
||||
distributor = cols[2].text.strip().lower()
|
||||
notes = cols[3].text.strip()
|
||||
@ -279,6 +285,7 @@ def parse_credits_table(table, distributor_list):
|
||||
|
||||
movies.append({
|
||||
'title': title,
|
||||
'href' : href,
|
||||
'year': year,
|
||||
'distributor': distributor,
|
||||
'notes': notes,
|
||||
@ -364,8 +371,9 @@ def parse_page_performer(soup):
|
||||
data['blacked_cnt'] = distributor_count['blacked']
|
||||
data['tushy_cnt'] = distributor_count['tushy']
|
||||
data['x_art_cnt'] = distributor_count['x-art']
|
||||
data['credits'] = credits_list
|
||||
|
||||
return data, credits_list
|
||||
return data
|
||||
|
||||
|
||||
|
||||
|
||||
@ -3,6 +3,7 @@ import json
|
||||
import config
|
||||
import utils
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
# 连接 SQLite 数据库
|
||||
@ -14,14 +15,112 @@ cursor = conn.cursor()
|
||||
def get_current_time():
    """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    now = datetime.now()
    # datetime's format spec is passed to strftime, so this matches strftime().
    return f"{now:%Y-%m-%d %H:%M:%S}"
|
||||
|
||||
|
||||
# """从指定表中通过 href 查找 id"""
|
||||
def get_id_by_href(table: str, href: str) -> int:
    """Look up the ``id`` of the row in *table* whose ``href`` equals *href*.

    Args:
        table: Table name; it is interpolated into the SQL text, so it must
            be a trusted identifier.
        href: Value to match against the ``href`` column.

    Returns:
        The id of the first matching row, or None when *href* is None or no
        row matches.
    """
    if href is not None:
        cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
        found = cursor.fetchone()
        if found:
            return found[0]
    return None
|
||||
|
||||
# 插入演员索引,来自于列表数据
|
||||
def insert_performer_index(name, href):
    """Insert a minimal performer row (href and name) taken from a listing page.

    An existing row with the same key is left untouched (``INSERT OR IGNORE``).

    Returns:
        The id found for *href* after the insert, or None when the lookup
        finds nothing or any error occurs (the transaction is rolled back).
    """
    sql = """
        INSERT OR IGNORE INTO iafd_performers (href, name) VALUES (?, ?)
    """
    try:
        cursor.execute(sql, (href, name))
        conn.commit()

        performer_id = get_id_by_href('iafd_performers', href)
        if performer_id:
            logging.debug(f'insert one performer index, id: {performer_id}, name: {name}, href: {href}')
        return performer_id
    except Exception as e:
        conn.rollback()
        # Database errors and every other error differ only in the log prefix.
        prefix = "数据库错误" if isinstance(e, sqlite3.Error) else "未知错误"
        logging.error(f"{prefix}: {e}")
        return None
|
||||
|
||||
|
||||
# """插入电影索引,来自于列表数据"""
|
||||
def insert_movie_index(title, href, release_year=0):
    """Insert a minimal movie row (title, href, release year) from a listing page.

    An existing row with the same key is left untouched (``INSERT OR IGNORE``).

    Returns:
        The id found for *href* after the insert, or None when the lookup
        finds nothing or any error occurs (the transaction is rolled back).
    """
    sql = """
        INSERT OR IGNORE INTO iafd_movies (title, href, release_year) VALUES (?, ?, ?)
    """
    values = (title, href, release_year)
    try:
        cursor.execute(sql, values)
        conn.commit()

        new_id = get_id_by_href('iafd_movies', href)
        if new_id:
            logging.debug(f'insert one movie index, id: {new_id}, title: {title}, href: {href}')
        return new_id
    except Exception as e:
        conn.rollback()
        logging.error("Error inserting movie: %s", e)
        return None
|
||||
|
||||
# 插入演员和电影的关联数据
|
||||
def insert_performer_movie(performer_id, movie_id, role, notes):
    """Insert or update the link between a performer and a movie.

    When a row for ``(movie_id, performer_id)`` already exists, its ``notes``
    and ``role`` are overwritten with the new values.

    Returns:
        *performer_id* on success, or None when any error occurs (the
        transaction is rolled back).
    """
    try:
        cursor.execute("""
            INSERT INTO iafd_performers_movies (performer_id, movie_id, role, notes)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes, role=excluded.role
            """,
            (performer_id, movie_id, role, notes)
        )
        conn.commit()
        return performer_id

    except Exception as e:
        conn.rollback()
        # Name the table being written so the failure is not mistaken for a
        # movie insert in the logs.
        logging.error("Error inserting performer_movie: %s", e)
        return None
|
||||
|
||||
# 插入电影和电影的关联数据
|
||||
def insert_movie_appears_in(movie_id, appears_in_id, gradation=0, notes=''):
    """Insert or update the link between a movie and a movie it appears in.

    When a row for ``(movie_id, appears_in_id)`` already exists, its ``notes``
    and ``gradation`` are overwritten with the new values.

    Returns:
        *movie_id* on success, or None when any error occurs (the
        transaction is rolled back).
    """
    try:
        cursor.execute("""
            INSERT INTO iafd_movies_appers_in (movie_id, appears_in_id, gradation, notes)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(movie_id, appears_in_id) DO UPDATE SET notes=excluded.notes, gradation=excluded.gradation
            """,
            (movie_id, appears_in_id, gradation, notes)
        )
        conn.commit()
        return movie_id

    except Exception as e:
        conn.rollback()
        # Name the relation being written so the failure is not mistaken for
        # a plain movie insert in the logs.
        logging.error("Error inserting movie_appears_in: %s", e)
        return None
|
||||
|
||||
|
||||
# 插入演员信息
|
||||
def insert_or_update_performer(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_performers (href, name, gender, birthday, astrology, birthplace, years_active, ethnicity, nationality, hair_colors,
|
||||
eye_color, height_str, weight_str, measurements, tattoos, piercings, weight, height, movies_cnt, vixen_cnt,
|
||||
blacked_cnt, tushy_cnt, x_art_cnt, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
|
||||
blacked_cnt, tushy_cnt, x_art_cnt, is_full_data, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
gender = excluded.gender,
|
||||
@ -45,6 +144,7 @@ def insert_or_update_performer(data):
|
||||
blacked_cnt = excluded.blacked_cnt,
|
||||
tushy_cnt = excluded.tushy_cnt,
|
||||
x_art_cnt = excluded.x_art_cnt,
|
||||
is_full_data = 1,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (
|
||||
data["href"], data["person"], data.get("gender"), data.get("birthday"), data.get("astrology"), data.get("birthplace"), data.get("years_active"),
|
||||
@ -54,20 +154,36 @@ def insert_or_update_performer(data):
|
||||
))
|
||||
|
||||
# 获取 performer_id
|
||||
cursor.execute("SELECT id FROM iafd_performers WHERE href = ?", (data["href"],))
|
||||
performer_id = cursor.fetchone()[0]
|
||||
|
||||
# 删除旧的 alias
|
||||
cursor.execute("DELETE FROM iafd_performer_aliases WHERE performer_id = ?", (performer_id,))
|
||||
performer_id = get_id_by_href('iafd_performers', data["href"])
|
||||
if performer_id is None:
|
||||
return None
|
||||
logging.debug(f'insert one performer, id: {performer_id}, name: {data['person']}, href: {data['href']}')
|
||||
|
||||
# 插入新的 alias
|
||||
#for alias in data.get("performer_aka", []):
|
||||
for alias in data.get("performer_aka") or []:
|
||||
if alias.lower() != "no known aliases":
|
||||
cursor.execute("INSERT INTO iafd_performer_aliases (performer_id, alias) VALUES (?, ?) ON CONFLICT(performer_id, alias) DO NOTHING ", (performer_id, alias))
|
||||
cursor.execute("INSERT OR IGNORE INTO iafd_performer_aliases (performer_id, alias) VALUES (?, ?) ", (performer_id, alias))
|
||||
|
||||
conn.commit()
|
||||
logging.debug(f"成功插入/更新演员: {data['person']}")
|
||||
|
||||
# 插入影片列表,可能有 personal 和 director 两个身份
|
||||
credits = data['credits']
|
||||
if credits is None :
|
||||
return performer_id
|
||||
for role, movies in credits.items():
|
||||
if movies:
|
||||
for movie in movies:
|
||||
movie_id = get_id_by_href('iafd_movies', movie['href'])
|
||||
# 影片不存在,先插入
|
||||
if movie_id is None:
|
||||
movie_id = insert_movie_index(movie['title'], movie['href'], utils.to_number(movie['year']))
|
||||
if movie_id:
|
||||
tmp_id = insert_performer_movie(performer_id, movie_id, role, movie['notes'])
|
||||
if tmp_id :
|
||||
logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}, role: {role}')
|
||||
else:
|
||||
logging.warning(f'insert performer_movie failed. performer_id: {performer_id}, moive href: {movie['href']}')
|
||||
|
||||
return performer_id
|
||||
|
||||
except sqlite3.Error as e:
|
||||
@ -124,7 +240,7 @@ def query_performer(identifier):
|
||||
# 按条件查询 href 列表
|
||||
def query_performer_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href FROM iafd_performers WHERE 1=1"
|
||||
sql = "SELECT href, name FROM iafd_performers WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
@ -136,9 +252,17 @@ def query_performer_hrefs(**filters):
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
if "is_full_data" in filters:
|
||||
sql += " AND is_full_data = ?"
|
||||
params.append(filters["is_full_data"])
|
||||
if 'limit' in filters:
|
||||
sql += " limit ?"
|
||||
params.append(filters["limit"])
|
||||
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0].lower() for row in cursor.fetchall()] # 返回小写
|
||||
#return [row[0].lower() for row in cursor.fetchall()] # 返回小写
|
||||
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
@ -303,12 +427,6 @@ def query_studio_hrefs(**filters):
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# """从指定表中通过 href 查找 id"""
|
||||
def get_id_by_href(table: str, href: str) -> int:
    """Return the ``id`` of the row in *table* whose ``href`` equals *href*.

    *table* is interpolated into the SQL text, so it must be a trusted
    identifier. Returns None when no row matches.
    """
    query = f"SELECT id FROM {table} WHERE href = ?"
    cursor.execute(query, (href,))
    match = cursor.fetchone()
    if not match:
        return None
    return match[0]
|
||||
|
||||
# """插入或更新电影数据"""
|
||||
def insert_or_update_movie(movie_data):
|
||||
try:
|
||||
@ -316,62 +434,67 @@ def insert_or_update_movie(movie_data):
|
||||
distributor_id = get_id_by_href('iafd_distributors', movie_data['DistributorHref'])
|
||||
studio_id = get_id_by_href('iafd_studios', movie_data['StudioHref'])
|
||||
director_id = get_id_by_href('iafd_performers', movie_data['DirectorHref'])
|
||||
# 导演不存在的话,插入一条
|
||||
if director_id is None:
|
||||
director_id = insert_performer_index( movie_data['Director'], movie_data['DirectorHref'])
|
||||
|
||||
# 插入或更新电影信息
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO iafd_movies (title, minutes, distributor_id, studio_id, release_date, added_to_IAFD_date,
|
||||
all_girl, all_male, compilation, webscene, director_id, href, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
|
||||
all_girl, all_male, compilation, webscene, director_id, href, is_full_data, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
title=excluded.title, minutes=excluded.minutes, distributor_id=excluded.distributor_id,
|
||||
studio_id=excluded.studio_id, release_date=excluded.release_date,
|
||||
added_to_IAFD_date=excluded.added_to_IAFD_date, all_girl=excluded.all_girl,
|
||||
all_male=excluded.all_male, compilation=excluded.compilation, webscene=excluded.webscene,
|
||||
director_id=excluded.director_id, updated_at = datetime('now', 'localtime')
|
||||
director_id=excluded.director_id, is_full_data=1, updated_at = datetime('now', 'localtime')
|
||||
""",
|
||||
(movie_data['title'], movie_data['Minutes'], distributor_id, studio_id, movie_data['ReleaseDate'],
|
||||
movie_data['AddedtoIAFDDate'], movie_data['All-Girl'], movie_data['All-Male'],
|
||||
movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
|
||||
)
|
||||
conn.commit()
|
||||
logging.debug("Movie inserted/updated: %s", movie_data['title'])
|
||||
|
||||
# 获取插入的 movie_id
|
||||
cursor.execute("SELECT id FROM iafd_movies WHERE href = ?", (movie_data['href'],))
|
||||
movie_id = cursor.fetchone()[0]
|
||||
movie_id = get_id_by_href('iafd_movies', movie_data['href'])
|
||||
if movie_id is None:
|
||||
return None
|
||||
|
||||
logging.debug(f'insert one move, id: {movie_id}, title: {movie_data['title']}, href: {movie_data['href']}')
|
||||
|
||||
# 插入 performers_movies 关系表
|
||||
for performer in movie_data.get('Performers', []):
|
||||
performer_id = get_id_by_href('iafd_performers', performer['href'])
|
||||
# 如果演员不存在,先插入
|
||||
if performer_id is None:
|
||||
performer_id = insert_performer_index(performer['name'], performer['href'])
|
||||
if performer_id:
|
||||
notes = '|'.join(performer['tags'])
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO iafd_performers_movies (performer_id, movie_id, role, notes)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes
|
||||
""",
|
||||
(performer_id, movie_id, "Actor", notes)
|
||||
)
|
||||
logging.debug(f"Performers {performer['href']} linked to movie: %s", movie_data['title'])
|
||||
notes = '|'.join(tag for tag in performer['tags'] if tag != performer['name'])
|
||||
tmp_id = insert_performer_movie(performer_id, movie_id, 'personal', notes)
|
||||
if tmp_id:
|
||||
logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}")
|
||||
else:
|
||||
logging.debug(f'insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}')
|
||||
else:
|
||||
logging.warning(f'missing performer, url {performer['href']}, in movie: ({movie_data['title']}) {movie_data['href']}')
|
||||
logging.warning(f'insert perfomer failed. name: {performer['name']}, href: {performer['href']}')
|
||||
|
||||
# 插入 movies_appers_in 表
|
||||
for appears in movie_data.get("AppearsIn", []):
|
||||
appears_in_id = get_id_by_href('iafd_movies', appears['href'])
|
||||
# 不存在,先插入
|
||||
if appears_in_id is None:
|
||||
appears_in_id = insert_movie_index( appears['title'], appears['href'])
|
||||
if appears_in_id:
|
||||
appears_in_id = appears_in_id[0]
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_movies_appers_in (movie_id, appears_in_id, gradation, notes)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT(movie_id, appears_in_id) DO NOTHING
|
||||
""", (movie_id, appears_in_id, 1, appears["title"]))
|
||||
tmp_id = insert_movie_appears_in(movie_id, appears_in_id)
|
||||
if tmp_id:
|
||||
logging.debug(f'insert one movie_appears_in record. movie_id: {movie_id}, appears_in_id: {appears_in_id}')
|
||||
else:
|
||||
logging.warning(f'insert movie_appears_in failed. movie_id: {movie_id}, appears_in_id: {appears_in_id}')
|
||||
else:
|
||||
logging.warning(f'missing AppearsIn movie in movies table, parent_url {appears['href']}, in movie: ({movie_data['title']}) {movie_data['href']}')
|
||||
|
||||
conn.commit()
|
||||
logging.warning(f'get appears_in_id failed. title: {appears['title']}, href: {appears['href']}')
|
||||
|
||||
return movie_id
|
||||
|
||||
except Exception as e:
|
||||
@ -424,7 +547,7 @@ def query_movies(identifier):
|
||||
# 按条件查询 href 列表
|
||||
def query_movie_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href FROM iafd_movies WHERE 1=1"
|
||||
sql = "SELECT href, title FROM iafd_movies WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
@ -436,9 +559,16 @@ def query_movie_hrefs(**filters):
|
||||
if "title" in filters:
|
||||
sql += " AND title LIKE ?"
|
||||
params.append(f"%{filters['title']}%")
|
||||
if "is_full_data" in filters:
|
||||
sql += " AND is_full_data = ?"
|
||||
params.append(filters["is_full_data"])
|
||||
if 'limit' in filters:
|
||||
sql += " limit ?"
|
||||
params.append(filters["limit"])
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
#return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
@ -457,7 +587,7 @@ def insert_task_log():
|
||||
return None
|
||||
|
||||
# 更新任务日志的字段
|
||||
def update_task_log(task_id, **kwargs):
|
||||
def update_task_log_inner(task_id, **kwargs):
|
||||
try:
|
||||
fields = ", ".join(f"{key} = ?" for key in kwargs.keys())
|
||||
params = list(kwargs.values()) + [task_id]
|
||||
@ -468,30 +598,45 @@ def update_task_log(task_id, **kwargs):
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"更新任务 {task_id} 失败: {e}")
|
||||
|
||||
# 更新任务日志的字段
|
||||
def update_task_log(task_id, task_status):
    """Store current table row counts and a status on a task-log entry.

    Runs one ``COUNT(*)`` query per statistic, then forwards all of the
    counts together with ``task_status`` to ``update_task_log_inner``.

    Args:
        task_id: Identifier of the task-log entry to update.
        task_status: Status value written alongside the counts.

    Returns:
        None. A ``sqlite3.Error`` raised while counting is logged and
        swallowed.
    """
    # (task-log field, counting statement) pairs; the order is the order in
    # which the keyword arguments reach update_task_log_inner.
    count_statements = (
        ("full_data_performers", "SELECT COUNT(*) FROM iafd_performers where is_full_data=1"),
        ("total_performers", "SELECT COUNT(*) FROM iafd_performers"),
        ("full_data_movies", "SELECT COUNT(*) FROM iafd_movies where is_full_data=1"),
        ("total_movies", "SELECT COUNT(*) FROM iafd_movies"),
        ("total_distributors", "SELECT COUNT(*) FROM iafd_distributors"),
        ("total_studios", "SELECT COUNT(*) FROM iafd_studios"),
    )

    try:
        counts = {}
        for field_name, statement in count_statements:
            cursor.execute(statement)
            counts[field_name] = cursor.fetchone()[0]

        # Write the counts and the new status to the task log.
        update_task_log_inner(task_id, **counts, task_status=task_status)

    except sqlite3.Error as e:
        logging.error(f"更新任务 {task_id} 失败: {e}")
|
||||
|
||||
|
||||
# 任务结束,更新字段
|
||||
def finalize_task_log(task_id):
    """Mark a task-log entry as finished with status ``"Success"``.

    ``update_task_log`` gathers the table row counts itself, so this function
    only delegates to it. The earlier ``after_*`` keyword arguments are not
    part of its ``(task_id, task_status)`` signature and must not be passed.

    Args:
        task_id: Identifier of the task-log entry to finalize.

    Returns:
        None. A ``sqlite3.Error`` reaching this function is logged and
        swallowed.
    """
    try:
        update_task_log(task_id, task_status="Success")
    except sqlite3.Error as e:
        logging.error(f"任务 {task_id} 结束失败: {e}")
|
||||
|
||||
|
||||
@ -24,6 +24,13 @@ update_dir = '../result'
|
||||
performers_dir = f'{update_dir}/performers'
|
||||
movies_dir = f'{update_dir}/movies'
|
||||
|
||||
def to_number(value):
    """Convert a value to a float, returning 0 when it cannot be converted.

    Args:
        value: Anything ``float()`` accepts (numeric string, int, float, ...).

    Returns:
        ``float(value)`` on success; ``0`` when the conversion fails.
    """
    try:
        return float(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text; TypeError: None or other unsupported
        # types; OverflowError: an int too large to fit in a float.
        return 0
|
||||
|
||||
def dist_stu_href_rewrite(href):
|
||||
# 提取 ID(适用于 distrib 或 studio)
|
||||
import re
|
||||
|
||||
Reference in New Issue
Block a user