modify some scripts.
@@ -22,6 +22,7 @@ INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
 OUTPUT_JSON = os.path.join(OUTPUT_DIR, "movie_details.json")
 OUTPUT_CSV = os.path.join(OUTPUT_DIR, "movie_details.csv")
 BATCH_SIZE = 100  # write results to file every 100 records
+movies_dir = f'{RESULT_DIR}/movies'
 
 # Initialize the Cloudflare bypass tool
 scraper = cloudscraper.create_scraper()
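For reference, the scraper object shown in the context lines is a drop-in requests session that solves Cloudflare's JavaScript challenges. A minimal fetch sketch, independent of this script; the page requested is only an illustration (the domain appears in a comment in the next hunk):

import cloudscraper

# CloudScraper subclasses requests.Session, so the usual get()/post() API applies.
scraper = cloudscraper.create_scraper()
resp = scraper.get('https://www.iafd.com/')  # illustrative URL, not taken from this commit's code paths
print(resp.status_code, len(resp.text))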
@@ -197,6 +198,34 @@ def parse_movie_details(html, href, title):
         "AppearsIn": appears_in,
     }
 
+# Create the prefix sub-directory
+def create_sub_directory(base_dir, str):
+    # Use the first character of the id (lowercased) as the sub-directory name
+    sub_dir = str[:1].lower()
+    full_path = os.path.join(base_dir, sub_dir)
+    if not os.path.exists(full_path):
+        os.makedirs(full_path)
+    return full_path
+
+# Extract the id value from a URL such as https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
+def extract_id_from_href(href):
+    """Extract the id parameter from the href"""
+    match = re.search(r'id=([a-f0-9\-]+)', href)
+    return match.group(1) if match else ''
+
+# Write each movie to its own JSON file
+def write_movie_json(href, data):
+    # Resolve the output directory
+    movie_id = extract_id_from_href(href)
+    person_dir = create_sub_directory(movies_dir, movie_id)
+    person_filename = f"{movie_id}.json"
+    full_path = os.path.join(person_dir, person_filename)
+
+    try:
+        with open(full_path, 'w', encoding='utf-8') as json_file:
+            json.dump(data, json_file, indent=4, ensure_ascii=False)
+    except Exception as e:
+        logging.error(f"Error writing file {full_path}: {e}")
 
 def process_movies():
     """Process movie data"""
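A quick standalone sanity check of the two new helpers, as a sketch; it assumes only the regex and the one-character prefix shown above:

import re

def extract_id_from_href(href):
    """Same regex as the helper added above."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''

href = 'https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586'
movie_id = extract_id_from_href(href)
print(movie_id)              # 21898a3c-1ddd-4793-8d93-375d6db20586
print(movie_id[:1].lower())  # '2' -> the JSON lands in <movies_dir>/2/21898a3c-....json

Because the ids are lowercase hex, the first character yields at most 16 prefix buckets, which keeps any single directory from accumulating every output file.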
@@ -233,6 +262,9 @@ def process_movies():
             else:
                 all_movies.append(movie)
                 count += 1
+
+                # Write the per-movie local file
+                write_movie_json(href, movie)
             break
 
         # Flush to disk every BATCH_SIZE records
@@ -270,15 +302,10 @@ def process_one(href):
             logging.warning(f'fetching {href} error. retrying...')
             continue  # fetch failed, skip this attempt
 
-    id = extract_id_from_href(href)
-    filename = f"./log/{id}.json"  # replace spaces with -
-
-    try:
-        with open(filename, 'w', encoding='utf-8') as json_file:
-            json.dump(movie, json_file, indent=4, ensure_ascii=False)
-    except Exception as e:
-        logging.error(f"Error writing file {filename}: {e}")
-
-    print(f'fetch succ. saved result in {filename}')
+    if movie:
+        write_movie_json(href, movie)
+
+        print(f'fetch succ. saved result in {movies_dir}')
 
 # Handle buffered data when the program is terminated
 def handle_exit_signal(signal, frame):
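One consequence of this refactor: write_movie_json logs and swallows write errors, so process_one still prints the success message even if the file never reached disk. If that matters, a possible variant (not part of this commit) is to have the helper report success. A sketch, reusing the helpers and module-level names (movies_dir, os, json, logging) defined in the script above:

def write_movie_json(href, data):
    """Variant sketch: same behaviour as above, but reports whether the write succeeded."""
    movie_id = extract_id_from_href(href)
    person_dir = create_sub_directory(movies_dir, movie_id)
    full_path = os.path.join(person_dir, f"{movie_id}.json")
    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
        return True
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")
        return False

# Hypothetical caller side:
#     if movie and write_movie_json(href, movie):
#         print(f'fetch succ. saved result in {movies_dir}')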