add some scripts.
@@ -31,6 +31,8 @@ import csv
 import logging
 import signal
 import sys
+import os
+import re
 from bs4 import BeautifulSoup
 from requests.exceptions import RequestException
 import config
@@ -43,17 +45,19 @@ res_dir = './result'
 res_json_file = f'{res_dir}/detail.json'
 res_csv_file = f'{res_dir}/detail.csv'
 input_json_file = f'{res_dir}/merged.json'
+performers_dir = f'{res_dir}/performers'
 
 # Store the results
 final_data = []
 
-# Read the hrefs from detail.json
+# Read the data already in detail.json, so we can resume from where we left off
 def load_existing_hrefs():
     existing_hrefs = set()
+    global final_data
     try:
         with open(res_json_file, 'r') as file:
-            data = json.load(file)
-            for entry in data:
+            final_data = json.load(file)
+            for entry in final_data:
                 existing_hrefs.add(entry['href'])
     except FileNotFoundError:
         logging.info("detail.json not found, starting fresh.")
@@ -65,7 +69,7 @@ def fetch_and_parse_page(url, scraper):
         response = scraper.get(url)
         if response.status_code != 200:
             logging.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
-            return None
+            return None, None
 
         # Parse the HTML content
         soup = BeautifulSoup(response.text, 'html.parser')
@@ -92,6 +96,47 @@ def fetch_and_parse_page(url, scraper):
         }
         reversed_map = {v: k for k, v in fields.items()}
 
+        # Parse the table data
+        movies = []
+        vixen_cnt = 0
+        blacked_cnt = 0
+        tushy_cnt = 0
+        x_art_cnt = 0
+        table = soup.find('table', id='personal')
+        if table:
+            rows = table.find_all('tr', class_='we')
+            for row in rows:
+                cols = row.find_all('td')
+                if len(cols) >= 6:
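+                    # Columns: title, year, distributor, notes, rev, formats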
+                    title = cols[0].text.strip()
+                    year = cols[1].text.strip()
+                    distributor = cols[2].text.strip().lower()
+                    notes = cols[3].text.strip()
+                    rev = cols[4].text.strip()
+                    formats = cols[5].text.strip()
+
+                    # Count keywords in the distributor field
+                    if 'vixen' in distributor:
+                        vixen_cnt += 1
+                    if 'blacked' in distributor:
+                        blacked_cnt += 1
+                    if 'tushy' in distributor:
+                        tushy_cnt += 1
+                    if 'x_art' in distributor:
+                        x_art_cnt += 1
+
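+                    # Collect the row so the full credit list can be saved with the performer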
+                    movies.append({
+                        'title': title,
+                        'year': year,
+                        'distributor': distributor,
+                        'notes': notes,
+                        'rev': rev,
+                        'formats': formats
+                    })
+        else:
+            logging.warning("movie table empty.")
+
+
         # Iterate over each bioheading
         bioheadings = soup.find_all('p', class_='bioheading')
         for bio in bioheadings:
@@ -112,11 +157,18 @@ def fetch_and_parse_page(url, scraper):
             if heading in reversed_map:
                 kkey = reversed_map[heading]
                 data[kkey] = biodata
+
+        # Add the statistics to data
+        data['movies_cnt'] = len(movies)
+        data['vixen_cnt'] = vixen_cnt
+        data['blacked_cnt'] = blacked_cnt
+        data['tushy_cnt'] = tushy_cnt
+        data['x_art_cnt'] = x_art_cnt
 
-        return data
+        return data, movies
     except RequestException as e:
         logging.error(f"Error fetching {url}: {e}")
-        return None
+        return None, None
 
 # Write detail.json
 def write_to_detail_json(data):
@@ -128,13 +180,24 @@ def write_to_csv(data):
     try:
         with open(res_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
             writer = csv.writer(csvfile, delimiter=',')
-            header = ['person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender', 'years_active', 'ethnicity', 'nationality', 'hair_colors', 'eye_color', 'height', 'weight', 'measurements', 'tattoos', 'piercings']
+            header = ['person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender', 'years_active', 'ethnicity',
+                      'nationality', 'hair_colors', 'eye_color', 'height', 'weight', 'measurements', 'tattoos', 'piercings',
+                      'movies_cnt', 'vixen_cnt', 'blacked_cnt', 'tushy_cnt', 'x_art_cnt']
             writer.writerow(header)
             for entry in data:
+                # Make sure performer_aka is always a list
+                performer_aka = entry.get('performer_aka', [])
+
+                # If it is None use an empty list; wrap any other non-list value in a list
+                if performer_aka is None:
+                    performer_aka = []
+                elif not isinstance(performer_aka, list):
+                    performer_aka = [performer_aka]
+
                 writer.writerow([
                     entry.get('person', ''),
                     entry.get('href', ''),
-                    '|'.join(entry.get('performer_aka', [])),
+                    '|'.join(performer_aka),
                     entry.get('birthday', ''),
                     entry.get('astrology', ''),
                     entry.get('birthplace', ''),
@@ -148,7 +211,12 @@ def write_to_csv(data):
                     entry.get('weight', ''),
                     entry.get('measurements', ''),
                     entry.get('tattoos', ''),
-                    entry.get('piercings', '')
+                    entry.get('piercings', ''),
+                    entry.get('movies_cnt', 0),
+                    entry.get('vixen_cnt', 0),
+                    entry.get('blacked_cnt', 0),
+                    entry.get('tushy_cnt', 0),
+                    entry.get('x_art_cnt', 0)
                 ])
     except Exception as e:
         logging.error(f"Error writing to CSV: {e}")
@@ -159,21 +227,49 @@ def handle_exit_signal(signal, frame):
     write_to_detail_json(final_data)
     sys.exit(0)
 
+# Create the directory
+def create_directory_for_person(person):
+    # Use the first letter of person, lowercased, as the subdirectory name
+    person_dir = person[:1].lower()
+    full_path = os.path.join(performers_dir, person_dir)
+    if not os.path.exists(full_path):
+        os.makedirs(full_path)
+    return full_path
+
+# Extract the id value from e.g. https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
+def extract_id_from_href(href):
+    """Extract the id parameter from the href."""
+    match = re.search(r'id=([a-f0-9\-]+)', href)
+    return match.group(1) if match else ''
+
+# Write a separate JSON file for each performer
+def write_person_json(person, href, data):
+    # Get the target directory
+    person_dir = create_directory_for_person(person)
+    person_id = extract_id_from_href(href)
+    person_filename = f"{person.replace(' ', '-')}({person_id}).json"  # replace spaces with -
+    full_path = os.path.join(person_dir, person_filename)
+
+    try:
+        with open(full_path, 'w', encoding='utf-8') as json_file:
+            json.dump(data, json_file, indent=4, ensure_ascii=False)
+    except Exception as e:
+        logging.error(f"Error writing file {full_path}: {e}")
+
+
 def main():
     # Initialize cloudscraper
     scraper = cloudscraper.create_scraper()
 
     # Load the list of existing hrefs
+    global final_data
     existing_hrefs = load_existing_hrefs()
+    logging.info(f"load data from {res_json_file}, count: {len(final_data)}")
 
     # Read merged.json
     with open(input_json_file, 'r') as file:
         merged_data = json.load(file)
 
-    # Register exit signal handlers
-    signal.signal(signal.SIGINT, handle_exit_signal)  # Handle Ctrl+C
-    signal.signal(signal.SIGTERM, handle_exit_signal)  # Handle kill signal
-
     # Iterate over the data in merged.json
     loop = 0
     for entry in merged_data:
@@ -187,30 +283,51 @@ def main():
         logging.info(f"Processing {href} - {person}")
 
-        # Fetch and parse the data
-        data = fetch_and_parse_page(href, scraper)
-        if data:
-            # If the data is valid, add it to final_data
-            final_data.append({
-                'href': href,
-                'person': person,
-                **data
-            })
-            loop = loop+1
-            if loop % 100 == 0:
-                # Update the detail.json file
-                print(f'flush data to json file. now data count: {loop}')
-                write_to_detail_json(final_data)
-
-            # Update the set of existing hrefs
-            existing_hrefs.add(href)
+        while True:
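+            # Retry until the page has been fetched and parsed successfully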
+            data, movies = fetch_and_parse_page(href, scraper)
+            if data is None:
+                logging.warning(f'Retrying {href} - {person}')
+                time.sleep(3)
+            else:
+                break
+
+        # If the data is valid, add it to final_data
+        final_data.append({
+            'href': href,
+            'person': person,
+            **data
+        })
+
+        # Write the performer's own JSON file
+        full_data = {
+            'href': href,
+            'person': person,
+            **data,
+            'movies': movies if movies else []
+        }
+        write_person_json(person.strip(), href, full_data)
+
+        # Update the detail.json file
+        loop = loop + 1
+        if loop % 100 == 0:
+            logging.info(f'flush data to json file. now fetched data count: {loop}, total count: {len(final_data)}')
+            write_to_detail_json(final_data)
+            write_to_csv(final_data)
+
+        # Update the set of existing hrefs
+        existing_hrefs.add(href)
 
         # Delay to avoid being blocked for requesting too quickly
         time.sleep(1)
 
-    # Write the CSV once when finished
-    write_to_csv(final_data)
-
-    logging.info("Data processing completed.")
 
 if __name__ == "__main__":
-    main()
+    try:
+        # Register exit signal handlers
+        signal.signal(signal.SIGINT, handle_exit_signal)  # Handle Ctrl+C
+        signal.signal(signal.SIGTERM, handle_exit_signal)  # Handle kill signal
+        main()
+    finally:
+        # Cleanup, to make sure it runs when the program exits normally
+        write_to_csv(final_data)  # Write to CSV or other necessary tasks
+        write_to_detail_json(final_data)  # Save data to JSON
+        logging.info("Data processing completed.")