From 118501a4ef263e9ebae59365f249f30af984f5be Mon Sep 17 00:00:00 2001 From: oscarz Date: Tue, 1 Apr 2025 10:03:24 +0800 Subject: [PATCH] modify scripts --- iafd/src/fetch.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py index c1a68cc..25f2893 100644 --- a/iafd/src/fetch.py +++ b/iafd/src/fetch.py @@ -103,10 +103,11 @@ def fetch_performers_by_ethnic(): url = row['href'] ethnic = row['name'] next_url = url - + count = 0 + pages = 0 while next_url: - logging.info(f"Fetching data for {ethnic}, url {url} ...") - soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"), + logging.info(f"Fetching data for {ethnic}, url {next_url} ...") + soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"), parser="lxml", preprocessor=scraper.preprocess_html) if soup: list_data, next_url = scraper.parse_page_ethnic(soup, ethnic) @@ -115,20 +116,24 @@ def fetch_performers_by_ethnic(): # 写入演员数据表 perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_ethnic_list=1) if perfomer_id: + count += 1 logging.debug(f'insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}') else: logging.warning(f'insert performer index failed. name: {row['person']}, href:{row['href']}') else: - logging.warning(f'fetch astro error. {url} ...') + logging.warning(f'fetch astro error. {next_url} ...') elif status_code and status_code == 404: logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skiping...') break else: - logging.warning(f'fetch astro error. {url} ...') + logging.warning(f'fetch astro error. {next_url} ...') + pages +=1 # 调试添加break if debug: return True + + logging.info(f"fetched data for {ethnic} total pages: {pages}, total performers: {count}") # 获取distributors列表 def fetch_distributors_list():