add some scripts.

This commit is contained in:
2025-02-12 15:33:33 +08:00
parent 62a2fbdc77
commit cad5aa11d6
17 changed files with 247927 additions and 4 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,64 @@
import os
import json
import csv
from bs4 import BeautifulSoup
# Directory containing the previously downloaded HTML pages to parse.
INPUT_DIR = "html"
# Destination paths for the extracted metadata (JSON and CSV mirrors of
# the same records). NOTE(review): assumes "./result" already exists.
OUTPUT_JSON = "./result/iafd_meta.json"
OUTPUT_CSV = "./result/iafd_meta.csv"
# Site root used to absolutize the relative profile links found in the pages.
BASE_URL = "https://www.iafd.com"
def parse_html_file(filepath):
    """Parse one downloaded HTML page and extract the people listed on it.

    The file's base name (without extension) is treated as the astrology
    sign label for every record extracted from that page.

    Args:
        filepath: Path to the HTML file to parse.

    Returns:
        A list of dicts with keys "astrology", "birth_date", "person" and
        "href"; an empty list when the page has no ``div#astro`` element.
    """
    person_list = []
    # Page label = file name without directory or extension.
    sign = os.path.splitext(os.path.basename(filepath))[0]
    with open(filepath, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    astro_div = soup.find("div", id="astro")
    if not astro_div:
        # Fix: the message previously printed the literal text "(unknown)"
        # instead of interpolating the file path.
        print(f"Warning: No 'astro' div found in {filepath}")
        return []
    birth_date = None
    # Direct children of div#astro alternate between <h3 class="astroday">
    # date headers and <div class="perficon"> person entries; the most
    # recent header applies to every entry that follows it.
    for elem in astro_div.find_all(recursive=False):
        if elem.name == "h3" and "astroday" in elem.get("class", []):
            birth_date = elem.get_text(strip=True)
        elif elem.name == "div" and "perficon" in elem.get("class", []):
            a_tag = elem.find("a")
            if a_tag:
                href = BASE_URL + a_tag["href"]
                name = a_tag.find("span", class_="perfname")
                if name:
                    person_list.append({
                        "astrology": sign,
                        "birth_date": birth_date,
                        "person": name.get_text(strip=True),
                        "href": href,
                    })
    return person_list
def main():
    """Parse every ``*.html`` file in INPUT_DIR and save all extracted
    records to OUTPUT_JSON and OUTPUT_CSV."""
    all_persons = []
    for filename in os.listdir(INPUT_DIR):
        if filename.endswith(".html"):
            filepath = os.path.join(INPUT_DIR, filename)
            # Fix: the progress message previously printed the literal text
            # "(unknown)" instead of interpolating the file name.
            print(f"正在解析 {filename} ...")
            all_persons.extend(parse_html_file(filepath))
    # Robustness fix: make sure the output directory exists before writing.
    os.makedirs(os.path.dirname(OUTPUT_JSON) or ".", exist_ok=True)
    # 保存 JSON (save JSON)
    with open(OUTPUT_JSON, "w", encoding="utf-8") as json_file:
        json.dump(all_persons, json_file, indent=4, ensure_ascii=False)
    # 保存 CSV (save CSV)
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(
            csv_file,
            fieldnames=["astrology", "birth_date", "person", "href"],
        )
        writer.writeheader()
        writer.writerows(all_persons)
    print(f"Data extracted and saved to {OUTPUT_JSON} and {OUTPUT_CSV}")
if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -31,8 +31,8 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
# 目标文件路径
INPUT_FILE = "result/models.json"
OUTPUT_JSON = "result/models_detail.json"
OUTPUT_CSV = "result/models_detail.csv"
OUTPUT_JSON = "result/javhd_models.json"
OUTPUT_CSV = "result/javhd_models.csv"
HEADERS = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",

View File

@@ -28,8 +28,8 @@ from bs4 import BeautifulSoup
# 文件路径
DIR_RES = './result'
ACTRESSES_FILE = f"{DIR_RES}/actresses.json"
DETAILS_JSON_FILE = f"{DIR_RES}/actress_detail.json"
DETAILS_CSV_FILE = f"{DIR_RES}/actress_detail.csv"
DETAILS_JSON_FILE = f"{DIR_RES}/thelordofporn_pornstars.json"
DETAILS_CSV_FILE = f"{DIR_RES}/thelordofporn_pornstars.csv"
# 请求头和 Cookies模拟真实浏览器
HEADERS = {