From 8cd0a67b641dc57ec7debccd3490466b153548e6 Mon Sep 17 00:00:00 2001
From: oscarz
Date: Tue, 1 Jul 2025 16:39:25 +0800
Subject: [PATCH] modify scripts

---
 iafd/src/fetch.py | 187 +++++++++++++++++++++++++++------------------
 1 file changed, 111 insertions(+), 76 deletions(-)

diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py
index 1a027b0..bc31628 100644
--- a/iafd/src/fetch.py
+++ b/iafd/src/fetch.py
@@ -30,24 +30,33 @@ def fetch_performers_by_astro():
         url = scraper.astr_base_url + astro
         logging.info(f"Fetching data for {astro}, url {url} ...")
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_page_astro(soup, astro)
-            if list_data:
-                for row in list_data :
-                    # write the performer row to the index table
-                    perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_astro_list=1)
-                    if perfomer_id:
-                        logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
-                    else:
-                        logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
-        else:
-            logging.warning(f'fetch astro error. {url} ...')
+        while True:
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
+            if soup:
+                list_data, next_url = scraper.parse_page_astro(soup, astro)
+                if list_data:
+                    all_updated = True
+                    for row in list_data:
+                        # write the performer row to the index table
+                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_astro_list=1)
+                        if perfomer_id:
+                            logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
+                        else:
+                            logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
+                            all_updated = False
+                    # the page only counts as done once every row is written; then move to the next page
+                    if all_updated:
+                        break
+                else:
+                    logging.warning(f'fetch astro error. {url} ...')
+                    time.sleep(0.5)
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+                break
+            else:
+                logging.warning(f'fetch astro error. {url} ...')
+                time.sleep(3)
 
         # break added for debugging
         if debug:
@@ -60,23 +69,34 @@ def fetch_performers_by_birth():
     for day in range(1, 32): # iterate over days 1-31
         url = scraper.birth_base_url.format(month=month, day=day)
         logging.info(f"Fetching data for birth, url {url}")
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
-        if soup:
-            list_data, next_url = scraper.parse_page_birth(soup, month, day)
-            if list_data:
-                for row in list_data :
-                    # write the performer row to the index table
-                    perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_birth_list=1)
-                    if perfomer_id:
-                        logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
-                    else:
-                        logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
-        else:
-            logging.warning(f'fetch astro error. {url} ...')
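+
+        # NOTE: a partially written page is re-fetched and re-parsed from
+        # scratch; this assumes insert_performer_index can safely be called
+        # again for rows that already succeeded on an earlier pass.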
+        while True:
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
+            if soup:
+                list_data, next_url = scraper.parse_page_birth(soup, month, day)
+                if list_data:
+                    all_updated = True
+                    for row in list_data:
+                        # write the performer row to the index table
+                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_birth_list=1)
+                        if perfomer_id:
+                            logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
+                        else:
+                            logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
+                            all_updated = False
+                    # the page only counts as done once every row is written; then move to the next page
+                    if all_updated:
+                        break
+                else:
+                    logging.warning(f'fetch birth error. {url} ...')
+                    time.sleep(1)
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+                break
+            else:
+                logging.warning(f'fetch birth error. {url} ...')
+                time.sleep(3)
 
         # break added for debugging
         if debug:
@@ -119,16 +139,21 @@ def fetch_performers_by_ethnic():
 
             soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"), parser="lxml", preprocessor=scraper.preprocess_html)
             if soup:
-                list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
+                list_data, next_page_url = scraper.parse_page_ethnic(soup, ethnic)
                 if list_data:
+                    all_updated = True
                     for row in list_data :
                         # write the performer row to the index table
                         perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_ethnic_list=1)
                         if perfomer_id:
                             count += 1
-                            logging.debug("'insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
+                            logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
                         else:
-                            logging.warning("'insert performer index failed. name: {row['person']}, href:{row['href']}")
+                            logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
+                            all_updated = False
+                    # the page only counts as done once every row is written; then move to the next page
+                    if all_updated:
+                        next_url = next_page_url
                 else:
                     logging.warning(f'fetch astro error. {next_url} ...')
             elif status_code and status_code == 404:
                 logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skiping...')
                 break
             else:
                 logging.warning(f'fetch astro error. {next_url} ...')
+            time.sleep(3)
 
             pages +=1
 
         # break added for debugging
@@ -195,22 +221,32 @@ def fetch_movies_by_dist():
     url_list = db_tools.query_distributor_hrefs(name='vixen.com')
     for url in url_list:
         logging.info(f"Fetching data for distributor url {url} ...")
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
-            if list_data:
-                for movie in list_data:
-                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_dist_list=1)
-                    if tmp_id:
-                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
-                    else:
-                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
-            else :
-                logging.warning(f'parse_page_movie error. url: {url}')
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
-        else:
-            logging.warning(f'fetching page error. {url}')
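+
+        # same retry pattern as the performer fetchers: keep re-fetching the
+        # distributor page until every movie row has been persisted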
+        while True:
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
+            if soup:
+                list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
+                if list_data:
+                    all_updated = True
+                    for movie in list_data:
+                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_dist_list=1)
+                        if tmp_id:
+                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
+                        else:
+                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
+                            all_updated = False
+                    # the page only counts as done once every row is written; then move to the next page
+                    if all_updated:
+                        break
+                else:
+                    logging.warning(f'parse_page_movie error. url: {url}')
+                    time.sleep(1)
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+                break
+            else:
+                logging.warning(f'fetching page error. {url}')
+                time.sleep(3)
 
         # break added for debugging
         if debug:
             break
@@ -225,22 +261,32 @@ def fetch_movies_by_stu():
     url_list = db_tools.query_studio_hrefs(name='vixen.com')
     for url in url_list:
         logging.info(f"Fetching data for studio url {url} ...")
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
-            if list_data:
-                for movie in list_data:
-                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_stu_list=1)
-                    if tmp_id:
-                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
-                    else:
-                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
-            else :
-                logging.warning(f'parse_page_movie error. url: {url}')
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
-        else:
-            logging.warning(f'fetching page error. {url}')
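+
+        # same retry pattern again, validated against the 'studio' table
+        # instead of 'distable'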
+        while True:
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
+            if soup:
+                list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
+                if list_data:
+                    all_updated = True
+                    for movie in list_data:
+                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_stu_list=1)
+                        if tmp_id:
+                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
+                        else:
+                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
+                            all_updated = False
+                    # the page only counts as done once every row is written; then move to the next page
+                    if all_updated:
+                        break
+                else:
+                    logging.warning(f'parse_page_movie error. url: {url}')
+                    time.sleep(1)
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+                break
+            else:
+                logging.warning(f'fetching page error. {url}')
+                time.sleep(3)
 
         # break added for debugging
        if debug:
            break
@@ -325,17 +371,6 @@ def fetch_performers_detail():
         if debug:
             break
 
-    # get the list of performers pending update; this reconciliation still has issues
-    while False:
-        perfomers_list = db_tools.get_performers_needed_update(limit=limit_count)
-        if len(perfomers_list) < 1:
-            logging.info(f'all existed performers updated. ')
-            break
-        last_perfomer_id = fetch_performers_detail_once(perfomers_list)
-        logging.info(f'insert {len(perfomers_list)} person. last performer id: {last_perfomer_id}')
-        if debug:
-            break
-
 # update movie details
 def fetch_movies_detail():
     limit_count = 10 if debug else 100
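
Reviewer note, not part of the diff: the four while-True blocks above are the same fetch/parse/insert retry loop repeated four times. If this file is touched again, the loop could be collapsed into one helper. A minimal sketch, assuming the fetch_page/parse/insert call shapes used above; the helper name fetch_list_with_retry and the retry_delay/fail_delay parameters are hypothetical:

    import logging
    import time

    def fetch_list_with_retry(url, fetch, parse, insert,
                              retry_delay=1, fail_delay=3):
        """Re-fetch url until every parsed row inserts; return next_url or None."""
        while True:
            soup, status_code = fetch(url)
            if soup:
                list_data, next_url = parse(soup)
                if list_data:
                    # attempt every row so one failure does not hide the rest
                    results = [bool(insert(row)) for row in list_data]
                    if all(results):
                        return next_url  # page fully persisted; caller may advance
                else:
                    logging.warning(f'parse error. {url} ...')
                    time.sleep(retry_delay)
            elif status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
                return None
            else:
                logging.warning(f'fetch page error. {url} ...')
                time.sleep(fail_delay)

Each caller would pass its own partial(scraper.fetch_page, ...) and db_tools insert function, which would also keep the log messages consistent across the four fetchers.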