From d4e1953e8625fb577e1f3553cda1fcfef2908ac2 Mon Sep 17 00:00:00 2001 From: oscar Date: Tue, 4 Mar 2025 09:56:05 +0800 Subject: [PATCH] modify some scripts. --- .../0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json | 20 --- .../2f582dcf-192e-4adf-9d60-447df8f16b9c.json | 56 -------- .../9af4e9f4-68ce-47ec-a7d7-fde92862af57.json | 70 --------- .../ca753243-8e3a-49ac-88aa-357055187e8c.json | 85 ----------- scripts/iafd/src/fetch.py | 135 ++++++++++++++---- scripts/iafd/src/sqlite_utils.py | 60 +++++++- scripts/iafd/src/utils.py | 41 +++--- 7 files changed, 186 insertions(+), 281 deletions(-) delete mode 100644 scripts/iafd/result/movies/0/0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json delete mode 100644 scripts/iafd/result/movies/2/2f582dcf-192e-4adf-9d60-447df8f16b9c.json delete mode 100644 scripts/iafd/result/movies/9/9af4e9f4-68ce-47ec-a7d7-fde92862af57.json delete mode 100644 scripts/iafd/result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json diff --git a/scripts/iafd/result/movies/0/0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json b/scripts/iafd/result/movies/0/0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json deleted file mode 100644 index e04eb58..0000000 --- a/scripts/iafd/result/movies/0/0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "href": "https://www.iafd.com/title.rme/id=0b332f5e-c0b8-4536-a79c-5abd4da9c68a", - "title": "Barebackin' Men", - "Minutes": "No Data", - "Distributor": "1 Distribution", - "Studio": "1 Distribution", - "ReleaseDate": "No Data", - "AddedtoIAFDDate": "Jan 1, 2006", - "All-Girl": "No", - "All-Male": "Yes", - "Compilation": "No", - "Webscene": "", - "Director": "No Data", - "DirectorHref": "", - "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=10/1%2ddistribution.htm", - "StudioHref": "https://www.iafd.com/studio.rme/studio=10/1%2ddistribution.htm", - "Performers": [], - "SceneBreakdowns": [], - "AppearsIn": [] -} \ No newline at end of file diff --git a/scripts/iafd/result/movies/2/2f582dcf-192e-4adf-9d60-447df8f16b9c.json b/scripts/iafd/result/movies/2/2f582dcf-192e-4adf-9d60-447df8f16b9c.json deleted file mode 100644 index 51f224d..0000000 --- a/scripts/iafd/result/movies/2/2f582dcf-192e-4adf-9d60-447df8f16b9c.json +++ /dev/null @@ -1,56 +0,0 @@ -{ - "href": "https://www.iafd.com/title.rme/id=2f582dcf-192e-4adf-9d60-447df8f16b9c", - "title": "Slim Goodies POV 2", - "Minutes": "84", - "Distributor": "Exotic Vixen Films", - "Studio": "Exotic Vixen Films", - "ReleaseDate": "No Data", - "AddedtoIAFDDate": "Jan 17, 2024", - "All-Girl": "No", - "All-Male": "No", - "Compilation": "No", - "Webscene": "", - "Director": "Just Mike Starks", - "DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f", - "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm", - "StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm", - "Performers": [ - { - "name": "Amica Mea", - "href": "https://www.iafd.com/person.rme/id=933ab6e6-ba98-49a7-a9e2-c1a4523ab89c", - "tags": [ - "Amica Mea" - ] - }, - { - "name": "Baby Breezy", - "href": "https://www.iafd.com/person.rme/id=b0d3673c-20b4-41eb-bb15-622b2f8e5ed3", - "tags": [ - "Baby Breezy" - ] - }, - { - "name": "Blu Mere", - "href": "https://www.iafd.com/person.rme/id=8d41dd04-f188-44c3-ac40-dc64711cf905", - "tags": [ - "Blu Mere" - ] - }, - { - "name": "Just Mike Starks", - "href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f", - "tags": [ - "Just Mike Starks" - ] - }, - { - "name": "Mocha Menage", - "href": "https://www.iafd.com/person.rme/id=63b084e9-a144-41da-acf9-a913b68416cd", - "tags": [ - "Mocha Menage" - ] - } - ], - "SceneBreakdowns": [], - "AppearsIn": [] -} \ No newline at end of file diff --git a/scripts/iafd/result/movies/9/9af4e9f4-68ce-47ec-a7d7-fde92862af57.json b/scripts/iafd/result/movies/9/9af4e9f4-68ce-47ec-a7d7-fde92862af57.json deleted file mode 100644 index 94cc88d..0000000 --- a/scripts/iafd/result/movies/9/9af4e9f4-68ce-47ec-a7d7-fde92862af57.json +++ /dev/null @@ -1,70 +0,0 @@ -{ - "href": "https://www.iafd.com/title.rme/id=9af4e9f4-68ce-47ec-a7d7-fde92862af57", - "title": "Atlanta U: College Freaks", - "Minutes": "No Data", - "Distributor": "Exotic Vixen Films", - "Studio": "Exotic Vixen Films", - "ReleaseDate": "No Data", - "AddedtoIAFDDate": "Sep 19, 2020", - "All-Girl": "No", - "All-Male": "No", - "Compilation": "No", - "Webscene": "", - "Director": "Just Mike Starks", - "DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f", - "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm", - "StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm", - "Performers": [ - { - "name": "Aaliyah Ali", - "href": "https://www.iafd.com/person.rme/id=7904b6a0-965c-4137-92e2-2ca0cdc4ff38", - "tags": [ - "Aaliyah Ali" - ] - }, - { - "name": "Bones Montana", - "href": "https://www.iafd.com/person.rme/id=8c18f9ad-045d-475f-bee8-cab6bed1fad4", - "tags": [ - "Bones Montana" - ] - }, - { - "name": "Cameron Cox", - "href": "https://www.iafd.com/person.rme/id=ce96cf3d-e85e-4d46-ad5e-f05881c88a26", - "tags": [ - "Cameron Cox" - ] - }, - { - "name": "Crystal Cooper", - "href": "https://www.iafd.com/person.rme/id=fc02ac73-2892-4578-9bf4-eed87afc4980", - "tags": [ - "Crystal Cooper" - ] - }, - { - "name": "Jazmine Adore", - "href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4", - "tags": [ - "Jazmine Adore" - ] - }, - { - "name": "Just Mike Starks", - "href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f", - "tags": [ - "Just Mike Starks" - ] - }, - { - "name": "Lala Ivey", - "href": "https://www.iafd.com/person.rme/id=7ae838ee-764f-4725-b422-b41ffac1e67b", - "tags": [ - "Lala Ivey" - ] - } - ], - "SceneBreakdowns": [], - "AppearsIn": [] -} \ No newline at end of file diff --git a/scripts/iafd/result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json b/scripts/iafd/result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json deleted file mode 100644 index 08af9d1..0000000 --- a/scripts/iafd/result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json +++ /dev/null @@ -1,85 +0,0 @@ -{ - "href": "https://www.iafd.com/title.rme/id=ca753243-8e3a-49ac-88aa-357055187e8c", - "title": "Slim Goodies POV", - "Minutes": "61", - "Distributor": "Exotic Vixen Films", - "Studio": "Exotic Vixen Films", - "ReleaseDate": "No Data", - "AddedtoIAFDDate": "Sep 19, 2020", - "All-Girl": "No", - "All-Male": "No", - "Compilation": "No", - "Webscene": "", - "Director": "Just Mike Starks", - "DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f", - "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm", - "StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm", - "Performers": [ - { - "name": "Gina Ferrero", - "href": "https://www.iafd.com/person.rme/id=54d62a5b-2ce5-40fa-b050-14325a62f4fd", - "tags": [ - "Gina Ferrero" - ] - }, - { - "name": "Imani Reign", - "href": "https://www.iafd.com/person.rme/id=8566216f-bcbc-4764-ba50-73405cf79dce", - "tags": [ - "Imani Reign" - ] - }, - { - "name": "Jazmine Adore", - "href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4", - "tags": [ - "Jazmine Adore" - ] - }, - { - "name": "Just Mike Starks", - "href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f", - "tags": [ - "Just Mike Starks" - ] - }, - { - "name": "Niomie King", - "href": "https://www.iafd.com/person.rme/id=dbefea32-7f88-41a9-9de3-e467015d687a", - "tags": [ - "Niomie King" - ] - } - ], - "SceneBreakdowns": [ - { - "scene": "Scene 1", - "performers": [ - "Imani Reign", - "Just Mike Starks" - ] - }, - { - "scene": "Scene 2", - "performers": [ - "Jazmine Adore", - "Just Mike Starks" - ] - }, - { - "scene": "Scene 3", - "performers": [ - "Gina Ferrero", - "Just Mike Starks" - ] - }, - { - "scene": "Scene 4", - "performers": [ - "Niomie King", - "Just Mike Starks" - ] - } - ], - "AppearsIn": [] -} \ No newline at end of file diff --git a/scripts/iafd/src/fetch.py b/scripts/iafd/src/fetch.py index 4d670b3..1798b16 100644 --- a/scripts/iafd/src/fetch.py +++ b/scripts/iafd/src/fetch.py @@ -6,13 +6,13 @@ import argparse import logging from functools import partial import config -import sqlite_utils as utils +import sqlite_utils as db_tools import iafd_scraper as scraper -import utils as func +import utils config.setup_logging() -debug = True +debug = False # 按星座获取演员列表,无翻页 def fetch_performers_by_astro(existed_performer_hrefs): @@ -30,7 +30,7 @@ def fetch_performers_by_astro(existed_performer_hrefs): if row['href'] not in existed_performer_hrefs: performers.append({ 'person' : row['person'], - 'href' : row['href'] + 'href' : row['href'].lower() if row['href'] else '' }) else: logging.warning(f'fetch astro error. {url} ...') @@ -59,7 +59,7 @@ def fetch_performers_by_birth(existed_performer_hrefs): if row['href'] not in existed_performer_hrefs: performers.append({ 'person' : row['person'], - 'href' : row['href'] + 'href' : row['href'].lower() if row['href'] else '' }) else: logging.warning(f'fetch astro error. {url} ...') @@ -95,7 +95,7 @@ def fetch_performers_by_ethnic(existed_performer_hrefs): if row['href'] not in existed_performer_hrefs: performers.append({ 'person' : row['person'], - 'href' : row['href'] + 'href' : row['href'].lower() if row['href'] else '' }) else: logging.warning(f'fetch astro error. {url} ...') @@ -124,7 +124,7 @@ def fetch_distributors_list(existed_distributors_href): continue distributors_list.append({ 'name' : row['name'], - 'href' : dis_url + 'href' : dis_url.lower() if dis_url else '' }) else: logging.warning(f'fetch astro error. {url} ...') @@ -148,7 +148,7 @@ def fetch_studios_list(existed_studios_href): continue studios_list.append({ 'name' : row['name'], - 'href' : stu_url + 'href' : stu_url.lower() if stu_url else '' }) else: logging.warning(f'fetch astro error. {url} ...') @@ -159,28 +159,37 @@ def fetch_studios_list(existed_studios_href): # 获取更新 def check_update(): # 读取数据库中的演员列表 - existed_performer_hrefs = utils.query_performer_hrefs() + existed_performer_hrefs = db_tools.query_performer_hrefs() if not existed_performer_hrefs: logging.warning(f'get existed performers from db error.') return None + # 开启任务 + task_id = db_tools.insert_task_log() + if task_id is None: + logging.warning(f'insert task log error.') + return None + # 从列表页获取新的演员 new_performers = [] - #new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs)) - #new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs)) + if not debug : # 数据量较大,debug 模式下跳过 + new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs)) + new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs)) new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs)) # 逐个获取演员信息,并写入到db中 new_performers = list({item["href"]: item for item in new_performers}.values()) - logging.info(f'get new performers count: {len(new_performers)} ') + logging.info(f'get new performers count: {len(new_performers)} ') + db_tools.update_task_log(task_id, before_performers=len(existed_performer_hrefs), new_performers=len(new_performers), task_status='Inserting new performers') for performer in new_performers: url = performer['href'] person = performer['person'] + logging.info(f"Fetching data for performer {person}, url {url} ...") soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id")) if soup: data, credits = scraper.parse_page_performer(soup) if data: - performer_id = utils.insert_or_update_performer({ + performer_id = db_tools.insert_or_update_performer({ 'href': url, 'person': person, **data @@ -191,7 +200,7 @@ def check_update(): logging.warning(f'insert person: {person} {url} failed.') # 写入到本地json文件 - func.write_person_json(person, url, { + utils.write_person_json(person, url, { 'href': url, 'person': person, **data, @@ -206,33 +215,35 @@ def check_update(): break # 从数据库读取distributors列表 - existed_distributors_href = utils.query_distributor_hrefs() + existed_distributors_href = db_tools.query_distributor_hrefs() if existed_distributors_href is None: logging.warning(f'get existed distributors from db error.') return new_distributors = fetch_distributors_list(existed_distributors_href) + db_tools.update_task_log(task_id, before_distributors=len(existed_distributors_href), new_distributors=len(new_distributors), task_status='Inserting new distributors') for dist in new_distributors: - dist_id = utils.insert_or_update_distributor(dist) + dist_id = db_tools.insert_or_update_distributor(dist) if dist_id: logging.info(f'insert one distributor record, id: {dist_id}, name: {dist['name']}, href: {dist['href']}') else: logging.warning(f'insert into studio failed. name: {dist['name']} href: {dist['href']}') # 从数据库读取studios列表 - existed_studios_href = utils.query_studio_hrefs() + existed_studios_href = db_tools.query_studio_hrefs() if existed_studios_href is None: logging.warning(f'get existed studios from db error.') return new_studios = fetch_studios_list(existed_studios_href) + db_tools.update_task_log(task_id, before_studios=len(existed_studios_href), new_studios=len(new_studios), task_status='Inserting new studios') for stu in new_studios: - stu_id = utils.insert_or_update_studio(stu) + stu_id = db_tools.insert_or_update_studio(stu) if stu_id: logging.info(f'insert one studio record, id: {stu_id}, name: {stu['name']}, href: {stu['href']}') else: logging.warning(f'insert into studio failed. name: {stu['name']}, href: {stu['href']}') # 从数据库中读取影片列表 - existed_movies = utils.query_movie_hrefs() + existed_movies = db_tools.query_movie_hrefs() if existed_movies is None: logging.warning(f'load movies from db error') return @@ -240,11 +251,12 @@ def check_update(): new_movie_hrefs = [] # 遍历所有 distributors,获取 movies 列表 - existed_distributors_href = utils.query_distributor_hrefs(name='vixen') + existed_distributors_href = db_tools.query_distributor_hrefs(name='vixen') if existed_distributors_href is None: logging.warning(f'get existed distributors from db error.') return for url in existed_distributors_href: + logging.info(f"Fetching data for distributor url {url} ...") soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id")) if soup: list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable') @@ -265,11 +277,12 @@ def check_update(): logging.info(f'all new moives found for distributors, now total new {len(new_movies)}') # 遍历所有 studios,获取 movies 列表 - existed_studios_href = utils.query_studio_hrefs(name='vixen') + existed_studios_href = db_tools.query_studio_hrefs(name='vixen') if existed_studios_href is None: logging.warning(f'get existed studios from db error.') return for url in existed_studios_href: + logging.info(f"Fetching data for studio url {url} ...") soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id")) if soup: list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio') @@ -292,21 +305,28 @@ def check_update(): # 对新的影片,逐个获取内容 new_movies = list({item["href"]: item for item in new_movies}.values()) logging.info(f'get merged new movies, count: {len(new_movies)} ') + db_tools.update_task_log(task_id, before_movies=len(existed_movies), new_movies=len(new_movies), task_status='Inserting new movies') for movie in new_movies: url = movie['href'] title = movie['title'] + logging.info(f"Fetching data for movie {title}, url {url} ...") soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class")) if soup: movie_data = scraper.parse_page_movie(soup, url, title) if movie_data : - movie_id = utils.insert_or_update_movie(movie_data) + # 修复url不规范的问题 + if movie_data['DistributorHref']: + movie_data['DistributorHref'] = utils.dist_stu_href_rewrite(movie_data['DistributorHref'].lower()) + if movie_data['StudioHref']: + movie_data['StudioHref'] = utils.dist_stu_href_rewrite(movie_data['StudioHref'].lower()) + movie_id = db_tools.insert_or_update_movie(movie_data) if movie_id: - logging.info(f'insert one movie, id: {movie_id}, title: {title} url: {url}') + logging.info(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}') else: logging.warning(f'insert movie {url} failed.') # 写入到本地json文件 - func.write_movie_json(url, movie_data) + utils.write_movie_json(url, movie_data) else: logging.warning(f'parse_page_movie error. url: {url}') else: @@ -315,6 +335,71 @@ def check_update(): if debug: break + # TODO: + # 1, appearsIn 因为影片入库的先后顺序不可控,会出现无法插入 movies_appers_in 表的情况,应该要先记录下待处理的movie,所有记录插入完成后再做处理 + # 2, movie 的更新,涉及到performers的几个统计字段的更新,应该要找到本次tasklog启动后插入到 performers_movies 表里的所有performers,刷新其统计数据;也可以简单粗暴的全量更新 + # 3, 目前performers_movies以movies爬取的信息为主来更新,perfomers爬取的信息应该可以作为检验,尤其是perfomers页面有notes字段 + logging.info(f'all process completed!') + db_tools.finalize_task_log(task_id) + + +# 处理本地数据 +def load_data(): + # 导入已经在本地的 performers 数据 + perfomers_file = '../result/detail.json' + performers_data = utils.read_json(perfomers_file) + if performers_data is None: + print(f'read file error.') + performers_data = [] + for person in performers_data: + performer_id = db_tools.insert_or_update_performer(person) + if performer_id: + logging.info(f'insert one person, id: {performer_id}, person: {person['person']}, url: {person['href']}') + else: + logging.warning(f'insert person: {person['person']}, {person['href']} failed.') + + # 导入已经在本地的 movies 数据 + movies_file = '../result/movie_details.json' + movies_data = utils.read_json(movies_file) + if movies_data is None: + print(f'read file error.') + movies_data = [] + for movie in movies_data: + # 修复url不规范的问题 + if movie['DistributorHref']: + movie['DistributorHref'] = utils.dist_stu_href_rewrite(movie['DistributorHref'].lower()) + if movie['StudioHref']: + movie['StudioHref'] = utils.dist_stu_href_rewrite(movie['StudioHref'].lower()) + movie_id = db_tools.insert_or_update_movie(movie) + if movie_id: + logging.info(f'insert one movie, id: {movie_id}, title: {movie['title']} url: {movie['href']}') + else: + logging.warning(f'insert movie {movie['title']}, {movie['href']} failed.') + + logging.info('task completed.') + + +# 主函数 +def main(task, args_debug): + global debug + debug = args_debug + if debug: + logging.info('Debug mode enabled.') + + if task == 'fetch': + check_update() + elif task == 'load': + load_data() + else: + print(f'unkown command. see --help.') + + if __name__ == "__main__": - check_update() \ No newline at end of file + # 命令行参数处理 + parser = argparse.ArgumentParser(description='fetch iafd data.') + parser.add_argument('--task', type=str, default='fetch', help='fetch from iafd.com or load from local data ... (fetch , load)') + parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)') + args = parser.parse_args() + + main(args.task, args.debug) diff --git a/scripts/iafd/src/sqlite_utils.py b/scripts/iafd/src/sqlite_utils.py index d2f0c10..5d052b8 100644 --- a/scripts/iafd/src/sqlite_utils.py +++ b/scripts/iafd/src/sqlite_utils.py @@ -137,7 +137,7 @@ def query_performer_hrefs(**filters): params.append(f"%{filters['name']}%") cursor.execute(sql, params) - return [row[0] for row in cursor.fetchall()] + return [row[0].lower() for row in cursor.fetchall()] # 返回小写 except sqlite3.Error as e: logging.error(f"查询 href 失败: {e}") @@ -217,7 +217,7 @@ def query_distributor_hrefs(**filters): params.append(f"%{filters['name']}%") cursor.execute(sql, params) - return [row[0] for row in cursor.fetchall()] + return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写 except sqlite3.Error as e: logging.error(f"查询 href 失败: {e}") @@ -296,7 +296,7 @@ def query_studio_hrefs(**filters): params.append(f"%{filters['name']}%") cursor.execute(sql, params) - return [row[0] for row in cursor.fetchall()] + return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写 except sqlite3.Error as e: logging.error(f"查询 href 失败: {e}") @@ -334,7 +334,7 @@ def insert_or_update_movie(movie_data): movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href']) ) conn.commit() - logging.info("Movie inserted/updated: %s", movie_data['title']) + logging.debug("Movie inserted/updated: %s", movie_data['title']) # 获取插入的 movie_id cursor.execute("SELECT id FROM movies WHERE href = ?", (movie_data['href'],)) @@ -437,12 +437,62 @@ def query_movie_hrefs(**filters): params.append(f"%{filters['title']}%") cursor.execute(sql, params) - return [row[0] for row in cursor.fetchall()] + return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写 except sqlite3.Error as e: logging.error(f"查询 href 失败: {e}") return [] +# 插入一条任务日志 +def insert_task_log(): + try: + cursor.execute(""" + INSERT INTO task_log (task_status) VALUES ('Start') + """) + conn.commit() + return cursor.lastrowid # 获取插入的 task_id + except sqlite3.Error as e: + logging.error(f"插入任务失败: {e}") + return None + +# 更新任务日志的字段 +def update_task_log(task_id, **kwargs): + try: + fields = ", ".join(f"{key} = ?" for key in kwargs.keys()) + params = list(kwargs.values()) + [task_id] + + sql = f"UPDATE task_log SET {fields}, updated_at = datetime('now', 'localtime') WHERE task_id = ?" + cursor.execute(sql, params) + conn.commit() + except sqlite3.Error as e: + logging.error(f"更新任务 {task_id} 失败: {e}") + +# 任务结束,更新字段 +def finalize_task_log(task_id): + try: + # 获取 performers、studios 等表的最终行数 + cursor.execute("SELECT COUNT(*) FROM performers") + after_performers = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(*) FROM movies") + after_movies = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(*) FROM distributors") + after_distributors = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(*) FROM studios") + after_studios = cursor.fetchone()[0] + + # 更新 task_log + update_task_log(task_id, + after_performers=after_performers, + after_movies=after_movies, + after_distributors=after_distributors, + after_studios=after_studios, + task_status="Success") + + except sqlite3.Error as e: + logging.error(f"任务 {task_id} 结束失败: {e}") if __name__ == "__main__": diff --git a/scripts/iafd/src/utils.py b/scripts/iafd/src/utils.py index 7b3cf82..deac2f2 100644 --- a/scripts/iafd/src/utils.py +++ b/scripts/iafd/src/utils.py @@ -24,27 +24,16 @@ update_dir = '../result' performers_dir = f'{update_dir}/performers' movies_dir = f'{update_dir}/movies' -def uniq_performers(new_performers): - try: - if not isinstance(new_performers, list): - raise TypeError(f"new_performers should be a list, but got {type(new_performers)}") +def dist_stu_href_rewrite(href): + # 提取 ID(适用于 distrib 或 studio) + import re + match = re.search(r"(distrib|studio)=(\d+)", href) + if not match: + return None # 不是目标 URL,返回 None - seen = set() - unique_performers = [] - - for item in new_performers: - if not item or item['href'] is None: - raise ValueError(f"Invalid item in new_performers: {item}") - - if item["href"] not in seen: - seen.add(item["href"]) - unique_performers.append(item) - - return unique_performers - - except Exception as e: - logging.error(f"Error in remove_duplicate_performers: {e}") - return [] # 返回空列表,避免程序崩溃 + key, id_number = match.groups() + new_url = f"https://www.iafd.com/{key}.rme/{key}={id_number}" + return new_url # 创建目录 def create_sub_directory(base_dir, str): @@ -90,3 +79,15 @@ def write_movie_json(href, data): except Exception as e: logging.error(f"Error writing file {full_path}: {e}") + +# 读取json文件并返回内容 +def read_json(file_path): + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except FileNotFoundError: + print(f"文件 {file_path} 未找到.") + return None + except json.JSONDecodeError: + print(f"文件 {file_path} 解析错误.") + return None