From a6f99a2551397aeaa704331bdde0a1b9a0625c98 Mon Sep 17 00:00:00 2001
From: oscarz
Date: Thu, 24 Apr 2025 14:48:02 +0800
Subject: [PATCH] modify scripts

---
 iafd/src/fetch.py         | 21 ++++++++++++++++++---
 iafd/src/sqlite_utils.py  |  8 ++++----
 javdb/src/fetch.py        | 17 ++++++++++++++---
 javdb/src/sqlite_utils.py |  4 ++--
 4 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py
index 88943af..001bab6 100644
--- a/iafd/src/fetch.py
+++ b/iafd/src/fetch.py
@@ -14,6 +14,7 @@ config.setup_logging()
 
 debug = False
 force = False
+skip_local = True
 
 # fetch the performer list by zodiac sign, no pagination
 def fetch_performers_by_astro():
@@ -242,8 +243,13 @@ def fetch_performers_detail_once(perfomers_list):
     for performer in perfomers_list:
         url = performer['href']
         person = performer['name']
+        curr_id = performer['id']
         logging.debug(f"Fetching data for performer ({person}), url {url} ...")
         soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
+        # page was served from the local cache; skip it
+        if skip_local and status_code == 99:
+            last_performer_id = curr_id
+            continue
         if soup:
             data = scraper.parse_page_performer(soup, url)
             if data:
@@ -326,8 +332,13 @@ def fetch_movies_detail():
     for movie in movies_list:
         url = movie['href']
         title = movie['title']
-        logging.debug(f"Fetching data for movie ({title}), url {url} ...")
+        curr_id = movie['id']
+        logging.debug(f"Fetching data for movie: {curr_id}: ({title}), url {url} ...")
         soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
+        # page was served from the local cache; skip it
+        if skip_local and status_code == 99:
+            last_movie_id = curr_id
+            continue
         if soup:
             movie_data = scraper.parse_page_movie(soup, url, title)
             if movie_data :
@@ -378,7 +389,7 @@ function_map = {
 }
 
 # main function
-def main(cmd, args_debug, args_force):
+def main(cmd, args_debug, args_force, args_skip_local):
     global debug
     debug = args_debug
     if debug:
@@ -388,6 +399,9 @@ def main(cmd, args_debug, args_force):
     global force
     force = args_force
 
+    global skip_local
+    skip_local = args_skip_local
+
     # start the task
     task_id = db_tools.insert_task_log()
     if task_id is None:
@@ -431,6 +445,7 @@ if __name__ == "__main__":
     parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
     parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
     parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
+    parser.add_argument('--skip_local', action='store_true', help='skip URLs whose HTML is already cached locally')
     args = parser.parse_args()
 
-    main(args.cmd, args.debug, args.force)
+    main(args.cmd, args.debug, args.force, args.skip_local)
diff --git a/iafd/src/sqlite_utils.py b/iafd/src/sqlite_utils.py
index b47e380..7a3ccbc 100644
--- a/iafd/src/sqlite_utils.py
+++ b/iafd/src/sqlite_utils.py
@@ -329,7 +329,7 @@ def query_performer(identifier):
 # query the href list by filter conditions
 def query_performer_hrefs(**filters):
     try:
-        sql = "SELECT href, name FROM iafd_performers WHERE 1=1"
+        sql = "SELECT href, name, id FROM iafd_performers WHERE 1=1"
         params = []
 
         if "id" in filters:
@@ -374,7 +374,7 @@ def query_performer_hrefs(**filters):
         logging.debug(f"query sql: {sql}")
         cursor.execute(sql, params)
         #return [row[0].lower() for row in cursor.fetchall()]  # return lowercase
-        return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
+        return [{'href': row[0], 'name': row[1], 'id': row[2]} for row in cursor.fetchall()]
     except sqlite3.Error as e:
         logging.error(f"failed to query hrefs: {e}")
 
@@ -756,7 +756,7 @@ def query_movies(identifier):
 # query the href list by filter conditions
 def query_movie_hrefs(**filters):
     try:
-        sql = "SELECT href, title FROM iafd_movies WHERE 1=1"
+        sql = "SELECT href, title, id FROM iafd_movies WHERE 1=1"
         params = []
 
         if "id" in filters:
@@ -802,7 +802,7 @@ def query_movie_hrefs(**filters):
         logging.debug(f"query sql: {sql}")
         cursor.execute(sql, params)
         #return [row[0].lower() for row in cursor.fetchall()]  # links use lowercase
-        return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
+        return [{'href': row[0], 'title': row[1], 'id': row[2]} for row in cursor.fetchall()]
     except sqlite3.Error as e:
         logging.error(f"failed to query hrefs: {e}")
 
diff --git a/javdb/src/fetch.py b/javdb/src/fetch.py
index 781fb5a..cf4824e 100644
--- a/javdb/src/fetch.py
+++ b/javdb/src/fetch.py
@@ -14,6 +14,7 @@ config.setup_logging()
 
 debug = False
 force = False
+skip_local = True
 
 # fetch the actor list
 def fetch_actor_list():
@@ -236,8 +237,14 @@ def fetch_movies_detail():
     for movie in movies_list:
         url = movie['href']
         title = movie['title']
+        curr_id = movie['id']
         logging.debug(f"Fetching data for movie ({title}), url {url} ...")
         soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
+        # page was served from the local cache; skip it
+        if skip_local and status_code == 99:
+            last_movie_id = curr_id
+            continue
+        # parse the page and write it to the database
         if soup:
             movie_data = scraper.parse_movie_detail(soup, url, title)
             if movie_data :
@@ -278,7 +285,7 @@ function_map = {
 }
 
 # main function
-def main(cmd, args_debug, args_force):
+def main(cmd, args_debug, args_force, args_skip_local):
     global debug
     debug = args_debug
     if debug:
@@ -288,13 +295,16 @@ def main(cmd, args_debug, args_force):
     global force
     force = args_force
 
+    global skip_local
+    skip_local = args_skip_local
+
     # start the task
     task_id = db_tools.insert_task_log()
     if task_id is None:
         logging.warning(f'insert task log error.')
         return None
 
-    logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, cmd: {cmd}')
+    logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, skip_local: {skip_local}, cmd: {cmd}')
 
     # run the specified functions
     if cmd:
@@ -328,6 +338,7 @@ if __name__ == "__main__":
     parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
     parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
     parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
+    parser.add_argument('--skip_local', action='store_true', help='skip URLs whose HTML is already cached locally')
     args = parser.parse_args()
 
-    main(args.cmd, args.debug, args.force)
+    main(args.cmd, args.debug, args.force, args.skip_local)
diff --git a/javdb/src/sqlite_utils.py b/javdb/src/sqlite_utils.py
index cff7943..864f798 100644
--- a/javdb/src/sqlite_utils.py
+++ b/javdb/src/sqlite_utils.py
@@ -595,7 +595,7 @@ def query_movies(identifier):
 # query the href list by filter conditions
 def query_movie_hrefs(**filters):
     try:
-        sql = "SELECT href, title FROM javdb_movies WHERE 1=1"
+        sql = "SELECT href, title, id FROM javdb_movies WHERE 1=1"
         params = []
 
         if "id" in filters:
@@ -643,7 +643,7 @@ def query_movie_hrefs(**filters):
         cursor.execute(sql, params)
 
         #return [row[0].lower() for row in cursor.fetchall()]  # links use lowercase
-        return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
+        return [{'href': row[0], 'title': row[1], 'id': row[2]} for row in cursor.fetchall()]
     except sqlite3.Error as e:
         logging.error(f"failed to query hrefs: {e}")
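
Note for reviewers: the patch treats scraper.fetch_page's status code 99 as "the HTML came from the local cache", and the new skip_local flag makes the detail loops record the current id and move on instead of re-parsing such pages. Below is a minimal, self-contained sketch of that loop pattern under those assumptions; fetch_page_stub and the sample rows are hypothetical stand-ins for scraper.fetch_page and the dicts returned by query_movie_hrefs, and are not part of the patch.

    CACHED_LOCALLY = 99  # status code the patch treats as "served from the local cache"

    def fetch_page_stub(url):
        # Hypothetical stand-in for scraper.fetch_page: pretend movie 2 is already cached on disk.
        cached = url.endswith("/2")
        return "<html>...</html>", (CACHED_LOCALLY if cached else 200)

    def fetch_details(rows, skip_local=True):
        last_movie_id = None
        fetched_ids = []
        for row in rows:
            html, status_code = fetch_page_stub(row["href"])
            # Same guard as the patched loops: remember the id and skip when the
            # page came from the local cache and skip_local is enabled.
            if skip_local and status_code == CACHED_LOCALLY:
                last_movie_id = row["id"]
                continue
            fetched_ids.append(row["id"])
        return fetched_ids, last_movie_id

    rows = [{"id": i, "href": f"https://example.com/movie/{i}", "title": f"t{i}"} for i in range(1, 5)]
    print(fetch_details(rows))                    # ([1, 3, 4], 2): the cached page is skipped
    print(fetch_details(rows, skip_local=False))  # ([1, 2, 3, 4], None): everything is processed

With the patch applied, the flag would be passed on the command line, e.g. python iafd/src/fetch.py --cmd <shortcut> --skip_local, where the valid --cmd shortcuts come from function_map and are not shown in these hunks.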