add some scripts.

2025-02-28 11:34:26 +08:00
parent f482a3353f
commit 3c14ce8cf2
6 changed files with 6644 additions and 2117 deletions


@@ -63,6 +63,46 @@ def load_existing_hrefs():
logging.info("detail.json not found, starting fresh.")
return existing_hrefs
# Parse the credits table; it can hold personal (acting) credits as well as directorial credits
def parse_credits_table(table, distributor_list):
# find the thead and skip it
thead = table.find('thead')
if thead:
thead.decompose()  # drop the thead, it does not need parsing
# only the tbody is left now
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
movies = []
distributor_count = {key: 0 for key in distributor_list}  # initialize a counter for each distributor
# rows = table.find_all('tr', class_='we')
for row in rows:
cols = row.find_all('td')
if len(cols) >= 6:
title = cols[0].text.strip()
year = cols[1].text.strip()
distributor = cols[2].text.strip().lower()
notes = cols[3].text.strip()
rev = cols[4].text.strip()
formats = cols[5].text.strip()
for key in distributor_list:
if key in distributor:
distributor_count[key] += 1
movies.append({
'title': title,
'year': year,
'distributor': distributor,
'notes': notes,
'rev': rev,
'formats': formats
})
return movies, distributor_count
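# Illustrative sketch (not part of this commit): one way the new parse_credits_table
# helper could be exercised on its own. page_html is a hypothetical string holding a
# fetched profile page; the distributor keywords mirror the list used below in
# fetch_and_parse_page.
def _sketch_parse_credits(page_html):
    from bs4 import BeautifulSoup  # the script already parses pages with BeautifulSoup
    soup = BeautifulSoup(page_html, 'html.parser')
    # prefer the personal credits table, fall back to the directorial one
    table = soup.find('table', id='personal') or soup.find('table', id='directoral')
    if table is None:
        return [], {}
    return parse_credits_table(table, ['vixen', 'blacked', 'tushy', 'x-art'])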
# Fetch the page and extract the required data
def fetch_and_parse_page(url, scraper):
try:
@@ -96,61 +136,31 @@ def fetch_and_parse_page(url, scraper):
}
reversed_map = {v: k for k, v in fields.items()}
# Parse the table data
movies = []
vixen_cnt = 0
blacked_cnt = 0
tushy_cnt = 0
x_art_cnt = 0
role = 'personal'
table = soup.find('table', id='personal')
if table is None:
table = soup.find('table', id='directoral')
role = 'directoral'
if table:
# find the thead and skip it
thead = table.find('thead')
if thead:
thead.decompose()  # drop the thead, it does not need parsing
# Parse the table data: collect the acting and directorial credit lists
role_list = ['personal', 'directoral']
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
credits_list = {}
# only the tbody is left now
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
# use a dict to hold the counts
distributor_count = {key: 0 for key in distributor_list}  # initialize a counter for each distributor
for role in role_list:
table = soup.find('table', id=role)
if table :
movies, stat_map = parse_credits_table(table, distributor_list)
credits_list[role] = movies
# update the distributor counts
for distributor in distributor_list:
distributor_count[distributor] += stat_map.get(distributor, 0)
# rows = table.find_all('tr', class_='we')
for row in rows:
cols = row.find_all('td')
if len(cols) >= 6:
title = cols[0].text.strip()
year = cols[1].text.strip()
distributor = cols[2].text.strip().lower()
notes = cols[3].text.strip()
rev = cols[4].text.strip()
formats = cols[5].text.strip()
# count the total number of movies
#movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
# count keyword hits in the distributor field
if 'vixen' in distributor:
vixen_cnt += 1
if 'blacked' in distributor:
blacked_cnt += 1
if 'tushy' in distributor:
tushy_cnt += 1
if 'x_art' in distributor:
x_art_cnt += 1
# nothing was found
if len(credits_list) == 0:
logging.warning(f"movie table empty. url: {url}")
movies.append({
'title': title,
'year': year,
'distributor': distributor,
'notes': notes,
'rev': rev,
'formats': formats
})
else:
logging.warning(f"movie table empty. ")
# iterate over each bioheading
# iterate over each bioheading and collect the metadata
bioheadings = soup.find_all('p', class_='bioheading')
for bio in bioheadings:
heading = bio.text.strip()
@@ -172,13 +182,13 @@ def fetch_and_parse_page(url, scraper):
data[kkey] = biodata
# add the statistics to data
data['movies_cnt'] = len(movies)
data['vixen_cnt'] = vixen_cnt
data['blacked_cnt'] = blacked_cnt
data['tushy_cnt'] = tushy_cnt
data['x_art_cnt'] = x_art_cnt
data['movies_cnt'] = movies_cnt
data['vixen_cnt'] = distributor_count['vixen']
data['blacked_cnt'] = distributor_count['blacked']
data['tushy_cnt'] = distributor_count['tushy']
data['x_art_cnt'] = distributor_count['x-art']
return data, {'role': role, 'movies' : movies}
return data, credits_list
except RequestException as e:
logging.error(f"Error fetching {url}: {e}")
return None, None
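# Note (illustrative, not part of this commit): the second value returned by
# fetch_and_parse_page is now a dict keyed by role instead of a flat movie list, roughly:
#     credits_list = {
#         'personal':   [{'title': ..., 'year': ..., 'distributor': ..., 'notes': ..., 'rev': ..., 'formats': ...}, ...],
#         'directoral': [...],
#     }
# where a role key is present only when the matching table was found, while data carries
# the bio fields plus the movies_cnt / vixen_cnt / blacked_cnt / tushy_cnt / x_art_cnt totals.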
@@ -270,7 +280,35 @@ def write_person_json(person, href, data):
logging.error(f"Error writing file {full_path}: {e}")
def main():
# Fetch one specified URL
def process_one(href):
# initialize cloudscraper
scraper = cloudscraper.create_scraper()
# fetch and parse the data
while True:
data, movies = fetch_and_parse_page(href, scraper)
if data is None:
logging.warning(f'Retrying {href}')
time.sleep(3)
else:
break
# write the performer's standalone JSON file
full_data = {
**data,
'credits': movies if movies else {}
}
person_id = extract_id_from_href(href)
person_filename = f"{person_id}.json"  # file name derived from the id in the href
try:
with open(person_filename, 'w', encoding='utf-8') as json_file:
json.dump(full_data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {person_filename}: {e}")
print(f'Fetch succeeded. Saved result to {person_filename}')
def process_all():
# initialize cloudscraper
scraper = cloudscraper.create_scraper()
@@ -297,7 +335,7 @@ def main():
# fetch and parse the data
while True:
data, movies = fetch_and_parse_page(href, scraper)
data, credits = fetch_and_parse_page(href, scraper)
if data is None:
logging.warning(f'Retrying {href} - {person}')
time.sleep(3)
@@ -316,7 +354,7 @@ def main():
'href': href,
'person': person,
**data,
'credits': movies if movies else {}
'credits': credits if credits else {}
}
write_person_json(person.strip(), href, full_data)
@@ -333,14 +371,23 @@ def main():
# delay so requests are not sent fast enough to get blocked
time.sleep(1)
if __name__ == "__main__":
# Full crawl
def main():
try:
# register exit signal handlers
signal.signal(signal.SIGINT, handle_exit_signal) # Handle Ctrl+C
signal.signal(signal.SIGTERM, handle_exit_signal) # Handle kill signal
main()
process_all()
finally:
# cleanup; make sure this runs when the program exits normally
write_to_csv(final_data) # Write to CSV or other necessary tasks
write_to_detail_json(final_data) # Save data to JSON
logging.info("Data processing completed.")
logging.info("Data processing completed.")
if __name__ == "__main__":
if len(sys.argv) > 1:
url = sys.argv[1]
process_one(url)
else:
main()
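# Usage sketch (illustrative, not part of this commit); "scrape.py" stands in for the
# actual script name, which this diff does not show:
#     python scrape.py https://example.com/person/123.htm   # fetch a single profile via process_one
#     python scrape.py                                       # full crawl via main() -> process_all()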