From 7b9b37092b88f52442950f6576392ad10b17a38e Mon Sep 17 00:00:00 2001 From: oscarz Date: Mon, 24 Mar 2025 10:19:33 +0800 Subject: [PATCH] modify scripts --- aabook/src/alter_table.py | 21 +++++++++++++-------- aabook/src/scraper.py | 2 +- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/aabook/src/alter_table.py b/aabook/src/alter_table.py index 06f4dee..54acc68 100644 --- a/aabook/src/alter_table.py +++ b/aabook/src/alter_table.py @@ -48,20 +48,24 @@ def check_dirty_chapters(): for i in range(100): table_name = f'{tbl_name_chapters_prefix}_{i}' try: - cursor.execute(f"SELECT count(*) FROM {table_name} WHERE updated_at >= '2025-03-23 10:20:00' and updated_at <= '2025-03-23 11:20:00' ") - row = cursor.fetchone() - if row: - dirty_chapters_all += row[0] + #cursor.execute(f"SELECT count(*) FROM {table_name} WHERE updated_at >= '2025-03-23 10:20:00' and updated_at <= '2025-03-23 11:20:00' ") + #cursor.execute(f"SELECT count(*) FROM {table_name} WHERE 1=1 ") + #row = cursor.fetchone() + #if row: + # dirty_chapters_all += row[0] - cursor.execute(f"SELECT count(*) FROM {table_name} WHERE updated_at >= '2025-03-23 10:20:00' and updated_at <= '2025-03-23 11:20:00' and content like '%aabook%' ") + #cursor.execute(f"SELECT count(*) FROM {table_name} WHERE updated_at >= '2025-03-23 10:20:00' and updated_at <= '2025-03-23 11:20:00' and content like '%aabook%' ") + cursor.execute(f"SELECT count(*) FROM {table_name} WHERE 1=1 and (content like '%aabook%' or content like '%疯情%') ") row = cursor.fetchone() if row: - dirty_chapters += row[0] + curr_rows = row[0] + print(f"check {table_name}, dirty pages: {curr_rows}") + dirty_chapters += curr_rows except sqlite3.Error as e: print(f"query error: {e}") - print(f"all: {dirty_chapters_all}, count: {dirty_chapters}") + print(f"dirty pages: {dirty_chapters}") # 检查脏数据 @@ -71,7 +75,8 @@ def update_dirty_chapters(): for i in range(100): table_name = f'{tbl_name_chapters_prefix}_{i}' try: - cursor.execute(f"update {table_name} set has_content = 0 WHERE updated_at >= '2025-03-23 10:20:00' and updated_at <= '2025-03-23 11:20:00' ") + #cursor.execute(f"update {table_name} set has_content = 0 WHERE updated_at >= '2025-03-23 10:20:00' and updated_at <= '2025-03-23 11:20:00' ") + cursor.execute(f"update {table_name} set has_content = 0 WHERE 1=1 and (content like '%aabook%' or content like '%疯情%') ") updated_rows = cursor.rowcount total += updated_rows print(f"update {table_name}, affected rows: {updated_rows}") diff --git a/aabook/src/scraper.py b/aabook/src/scraper.py index 99982cc..c89a390 100644 --- a/aabook/src/scraper.py +++ b/aabook/src/scraper.py @@ -58,7 +58,7 @@ def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retr logging.warning(f"Validation failed on attempt {attempt + 1} for {url}") except requests.RequestException as e: - logging.info(f"Warn fetching page {url}: {e}. Retrying ...") + logging.warning(f"fetching page ({url}) error: {e}, Retrying ...") time.sleep(sleep_time) # 休眠指定的时间,然后重试 logging.error(f'Fetching failed after max retries. {url}')