modify scripts
@@ -30,24 +30,33 @@ def fetch_performers_by_astro():
         url = scraper.astr_base_url + astro
         logging.info(f"Fetching data for {astro}, url {url} ...")
 
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_page_astro(soup, astro)
-            if list_data:
-                for row in list_data:
-                    # write row into the performer index table
-                    perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_astro_list=1)
-                    if perfomer_id:
-                        logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
-                    else:
-                        logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
-            else:
-                logging.warning(f'fetch astro error. {url} ...')
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
-        else:
-            logging.warning(f'fetch astro error. {url} ...')
+        while True:
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
+            if soup:
+                list_data, next_url = scraper.parse_page_astro(soup, astro)
+                if list_data:
+                    all_updated = True
+                    for row in list_data:
+                        # write row into the performer index table
+                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_astro_list=1)
+                        if perfomer_id:
+                            logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
+                        else:
+                            logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
+                            all_updated = False
+                    # only when every row is written successfully is the page done; move on
+                    if all_updated:
+                        break
+
+                else:
+                    logging.warning(f'fetch astro error. {url} ...')
+                time.sleep(0.5)
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+                break
+            else:
+                logging.warning(f'fetch astro error. {url} ...')
+            time.sleep(3)
 
         # break added for debugging
         if debug:
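Every hunk in this commit applies the same change: the single fetch-parse-insert pass is wrapped in a `while True:` retry loop, an `all_updated` flag records whether every parsed row reached the database, and the loop only exits (or the page cursor only advances) once the whole page is stored; a 404 response skips the page outright. The sketch below distills that control flow. It is a minimal illustration, not the script itself: `fetch_page`, `parse_page`, and `insert_row` are hypothetical stand-ins for the `scraper` and `db_tools` calls above.

    import logging
    import time

    def fetch_until_stored(url, fetch_page, parse_page, insert_row, delay=3.0):
        # Retry the same page until every parsed row is stored, or the page 404s.
        while True:
            soup, status_code = fetch_page(url)
            if soup:
                rows, _next_url = parse_page(soup)
                if rows:
                    all_updated = True
                    for row in rows:
                        if not insert_row(row):
                            logging.warning("insert failed: %r", row)
                            all_updated = False  # page not done; refetch and retry
                    if all_updated:
                        return True  # every row stored; move on
                else:
                    logging.warning("parse error: %s", url)
            elif status_code == 404:
                logging.warning("page not found, skipping: %s", url)
                return False
            time.sleep(delay)  # back off before retrying the same url

Note the pattern has no retry cap: a row that can never be inserted keeps the loop refetching the same page forever, which matches the behavior the commit introduces.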
@@ -60,23 +69,34 @@ def fetch_performers_by_birth():
     for day in range(1, 32):  # iterate over days 1 through 31
         url = scraper.birth_base_url.format(month=month, day=day)
         logging.info(f"Fetching data for birth, url {url}")
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
-        if soup:
-            list_data, next_url = scraper.parse_page_birth(soup, month, day)
-            if list_data:
-                for row in list_data:
-                    # write row into the performer index table
-                    perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_birth_list=1)
-                    if perfomer_id:
-                        logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
-                    else:
-                        logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
-            else:
-                logging.warning(f'fetch astro error. {url} ...')
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
-        else:
-            logging.warning(f'fetch astro error. {url} ...')
+
+        while True:
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
+            if soup:
+                list_data, next_url = scraper.parse_page_birth(soup, month, day)
+                if list_data:
+                    all_updated = True
+                    for row in list_data:
+                        # write row into the performer index table
+                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_birth_list=1)
+                        if perfomer_id:
+                            logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
+                            break
+                        else:
+                            logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
+                            all_updated = False
+                    # only when every row is written successfully is the page done; move on
+                    if all_updated:
+                        break
+                else:
+                    logging.warning(f'fetch astro error. {url} ...')
+                time.sleep(1)
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+                break
+            else:
+                logging.warning(f'fetch astro error. {url} ...')
+            time.sleep(3)
 
         # break added for debugging
         if debug:
@@ -119,16 +139,21 @@ def fetch_performers_by_ethnic():
         soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
                                                parser="lxml", preprocessor=scraper.preprocess_html)
         if soup:
-            list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
+            list_data, next_page_url = scraper.parse_page_ethnic(soup, ethnic)
             if list_data:
+                all_updated = True
                 for row in list_data:
                     # write row into the performer index table
                     perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_ethnic_list=1)
                     if perfomer_id:
                         count += 1
-                        logging.debug("'insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
+                        logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
                     else:
-                        logging.warning("'insert performer index failed. name: {row['person']}, href:{row['href']}")
+                        logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
+                        all_updated = False
+                # only advance to the next page once every row is written successfully
+                if all_updated:
+                    next_url = next_page_url
             else:
                 logging.warning(f'fetch astro error. {next_url} ...')
         elif status_code and status_code == 404:
@@ -136,6 +161,7 @@ def fetch_performers_by_ethnic():
             break
         else:
             logging.warning(f'fetch astro error. {next_url} ...')
+            time.sleep(3)
         pages += 1
 
         # break added for debugging
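`fetch_performers_by_ethnic` already paginates with an outer loop and a `next_url` cursor, so here the commit advances the cursor only on a fully stored page instead of breaking out: `parse_page_ethnic` now returns `next_page_url`, and `next_url` keeps its old value after any failed insert, so the next iteration refetches the same page. A sketch of that cursor variant, under the same hypothetical `fetch_page`/`parse_page`/`insert_row` stand-ins:

    import time

    def crawl(first_url, fetch_page, parse_page, insert_row, max_pages=1000):
        # Move the cursor only after a page is fully stored; on a partial
        # failure next_url is left unchanged, so the same page is refetched.
        next_url, pages = first_url, 0
        while next_url and pages < max_pages:
            soup, status_code = fetch_page(next_url)
            if soup:
                rows, next_page_url = parse_page(soup)
                all_updated = bool(rows)
                for row in rows or []:
                    if not insert_row(row):
                        all_updated = False
                if all_updated:
                    next_url = next_page_url  # may be None, which ends the crawl
            elif status_code == 404:
                break
            else:
                time.sleep(3)  # transient fetch error: wait, then retry this url
            pages += 1
        return pages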
@@ -195,22 +221,32 @@ def fetch_movies_by_dist():
     url_list = db_tools.query_distributor_hrefs(name='vixen.com')
     for url in url_list:
         logging.info(f"Fetching data for distributor url {url} ...")
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
-            if list_data:
-                for movie in list_data:
-                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_dist_list=1)
-                    if tmp_id:
-                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
-                    else:
-                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
-            else:
-                logging.warning(f'parse_page_movie error. url: {url}')
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
-        else:
-            logging.warning(f'fetching page error. {url}')
+
+        while True:
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
+            if soup:
+                list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
+                if list_data:
+                    all_updated = True
+                    for movie in list_data:
+                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_dist_list=1)
+                        if tmp_id:
+                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
+                        else:
+                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
+                            all_updated = False
+                    # only when every row is written successfully is the page done; move on
+                    if all_updated:
+                        break
+                else:
+                    logging.warning(f'parse_page_movie error. url: {url}')
+                time.sleep(1)
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+                break
+            else:
+                logging.warning(f'fetching page error. {url}')
+            time.sleep(3)
         # break added for debugging
         if debug:
             break
@@ -225,22 +261,32 @@ def fetch_movies_by_stu():
     url_list = db_tools.query_studio_hrefs(name='vixen.com')
     for url in url_list:
         logging.info(f"Fetching data for studio url {url} ...")
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
-            if list_data:
-                for movie in list_data:
-                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_stu_list=1)
-                    if tmp_id:
-                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
-                    else:
-                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
-            else:
-                logging.warning(f'parse_page_movie error. url: {url}')
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
-        else:
-            logging.warning(f'fetching page error. {url}')
+
+        while True:
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
+            if soup:
+                list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
+                if list_data:
+                    all_updated = True
+                    for movie in list_data:
+                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_stu_list=1)
+                        if tmp_id:
+                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
+                        else:
+                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
+                            all_updated = False
+                    # only when every row is written successfully is the page done; move on
+                    if all_updated:
+                        break
+                else:
+                    logging.warning(f'parse_page_movie error. url: {url}')
+                time.sleep(1)
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+                break
+            else:
+                logging.warning(f'fetching page error. {url}')
+            time.sleep(3)
         # break added for debugging
         if debug:
             break
@@ -325,17 +371,6 @@ def fetch_performers_detail():
         if debug:
             break
 
-    # fetch the list of performers awaiting update; this reconciliation still has some issues
-    while False:
-        perfomers_list = db_tools.get_performers_needed_update(limit=limit_count)
-        if len(perfomers_list) < 1:
-            logging.info(f'all existed performers updated. ')
-            break
-        last_perfomer_id = fetch_performers_detail_once(perfomers_list)
-        logging.info(f'insert {len(perfomers_list)} person. last performer id: {last_perfomer_id}')
-        if debug:
-            break
-
 # update movie details
 def fetch_movies_detail():
     limit_count = 10 if debug else 100
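The last hunk deletes a reconciliation pass that had been disabled by parking it under `while False:`, a loop whose test fails on the first check, so the body never runs. Deleting the block outright is cleaner than leaving the dead guard in place. For reference, the idiom being removed, with `reconcile` as a hypothetical stand-in for the deleted loop body:

    def reconcile():
        # Hypothetical stand-in for the reconciliation pass the commit removes.
        pass

    while False:
        # Never entered: the loop test fails on the first check, so everything
        # in this block is dead code; in effect, a block comment.
        reconcile()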