From 8fd48687fced44b33b21a888fb451d5e806381eb Mon Sep 17 00:00:00 2001 From: oscar Date: Mon, 3 Mar 2025 09:02:59 +0800 Subject: [PATCH] modify iafd scripts. --- scripts/iafd/distributors_list_fetch.py | 192 ----- scripts/iafd/html/01-aries.html | 357 -------- scripts/iafd/html/02-taurus.html | 357 -------- scripts/iafd/html/03-gemini.html | 357 -------- scripts/iafd/html/04-cancer.html | 357 -------- scripts/iafd/html/05-leo.html | 357 -------- scripts/iafd/html/06-virgo.html | 357 -------- scripts/iafd/html/07-libra.html | 357 -------- scripts/iafd/html/08-scorpio.html | 357 -------- scripts/iafd/html/09-sagittarius.html | 357 -------- scripts/iafd/html/10-capricorn.html | 357 -------- scripts/iafd/html/11-aquarius.html | 357 -------- scripts/iafd/html/12-pisces.html | 357 -------- scripts/iafd/html_format.py | 90 -- scripts/iafd/result/movie_details.csv | 20 + scripts/iafd/result/movie_details.json | 809 ++++++++++++++++++ scripts/iafd/result/movie_list.json | 1 + scripts/iafd/{ => src_json}/config.py | 11 +- .../movie_detail_fetch.py} | 105 ++- .../movie_list_fetch.py} | 116 ++- .../performers_details.py} | 2 +- .../performers_list_astro.py} | 2 +- .../performers_list_birth.py} | 2 +- .../performers_list_ethnic.py} | 2 +- .../performers_list_merge.py} | 15 +- scripts/iafd/{ => tools}/data_merge.py | 0 scripts/iafd/{ => tools}/iafd_scrape.py | 0 scripts/iafd/{ => tools}/stashdb_merge.py | 0 28 files changed, 1015 insertions(+), 4636 deletions(-) delete mode 100644 scripts/iafd/distributors_list_fetch.py delete mode 100644 scripts/iafd/html/01-aries.html delete mode 100644 scripts/iafd/html/02-taurus.html delete mode 100644 scripts/iafd/html/03-gemini.html delete mode 100644 scripts/iafd/html/04-cancer.html delete mode 100644 scripts/iafd/html/05-leo.html delete mode 100644 scripts/iafd/html/06-virgo.html delete mode 100644 scripts/iafd/html/07-libra.html delete mode 100644 scripts/iafd/html/08-scorpio.html delete mode 100644 scripts/iafd/html/09-sagittarius.html delete mode 100644 scripts/iafd/html/10-capricorn.html delete mode 100644 scripts/iafd/html/11-aquarius.html delete mode 100644 scripts/iafd/html/12-pisces.html delete mode 100644 scripts/iafd/html_format.py create mode 100644 scripts/iafd/result/movie_details.csv create mode 100644 scripts/iafd/result/movie_details.json create mode 120000 scripts/iafd/result/movie_list.json rename scripts/iafd/{ => src_json}/config.py (82%) rename scripts/iafd/{movie_meta_fetch.py => src_json/movie_detail_fetch.py} (67%) rename scripts/iafd/{studios_list_fetch.py => src_json/movie_list_fetch.py} (63%) rename scripts/iafd/{detail_fetch.py => src_json/performers_details.py} (99%) rename scripts/iafd/{list_fetch_astro.py => src_json/performers_list_astro.py} (99%) rename scripts/iafd/{list_fetch_birth.py => src_json/performers_list_birth.py} (99%) rename scripts/iafd/{list_fetch_ethnic.py => src_json/performers_list_ethnic.py} (99%) rename scripts/iafd/{list_merge.py => src_json/performers_list_merge.py} (92%) rename scripts/iafd/{ => tools}/data_merge.py (100%) rename scripts/iafd/{ => tools}/iafd_scrape.py (100%) rename scripts/iafd/{ => tools}/stashdb_merge.py (100%) diff --git a/scripts/iafd/distributors_list_fetch.py b/scripts/iafd/distributors_list_fetch.py deleted file mode 100644 index d58b70a..0000000 --- a/scripts/iafd/distributors_list_fetch.py +++ /dev/null @@ -1,192 +0,0 @@ -""" -Script Name: -Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare - detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。 - list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全 - list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全 - list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的 - list_merge.py 上面三个列表的数据,取交集,得到整体数据。 - iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配) - - html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。 - data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并; - stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并 - 从而获取到一份完整的数据列表。 - -Author: [Your Name] -Created Date: YYYY-MM-DD -Last Modified: YYYY-MM-DD -Version: 1.0 - -Modification History: - - YYYY-MM-DD [Your Name]: - - YYYY-MM-DD [Your Name]: - - YYYY-MM-DD [Your Name]: -""" - -import cloudscraper -import json -import time -import csv -from bs4 import BeautifulSoup -import logging -import config - -config.setup_logging() - -# 定义基础 URL 和可变参数 -host_url = "https://www.iafd.com" -base_url = f"{host_url}/distrib.rme/distrib=" -dist_list_url = f'{base_url}/distrib.asp' - -distr_map = { - 6812 : 'nubilefilms.com', - 8563 : 'teenmegaworld network', - 6779 : 'x-art.com', - 7133 : 'tushy.com', - 6496 : 'blacked.com', - 7758 : 'vixen.com', - 6791 : 'teamskeet.com', - 12454: 'vip4k.com', - 13541: 'wow network', - 9702 : 'cum4k.com', - 6778 : 'tiny4k.com', - 12667: 'anal4k.com', - 7419 : 'exotic4k.com', - 13594: 'facials4k.com', - 13633: 'mom4k.com', - 12335: 'slim4k.com', - 16709: 'strippers4k.com', - -} - -# 设置 headers 和 scraper -headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' -} -scraper = cloudscraper.create_scraper() - -# 结果路径 -res_dir = './result' -all_data = [] - -# 网络请求并解析 HTML -def fetch_page(url): - try: - response = scraper.get(url, headers=headers) - response.raise_for_status() - return response.text - except Exception as e: - logging.error(f"Failed to fetch {url}: {e}") - return None - -# 解析 HTML 内容,提取需要的数据 -def parse_page(html, name): - soup = BeautifulSoup(html, "html.parser") - table = soup.find("table", id="distable") - - if not table: - logging.warning(f"Warning: No 'distable' table found in {name}") - return None - - # 找到thead并跳过 - thead = table.find('thead') - if thead: - thead.decompose() # 去掉thead部分,不需要解析 - - # 现在只剩下tbody部分 - tbody = table.find('tbody') - rows = tbody.find_all('tr') if tbody else [] - - global all_data - for row in rows: - cols = row.find_all('td') - if len(cols) >= 5: - title = cols[0].text.strip() - label = cols[1].text.strip() - year = cols[2].text.strip() - rev = cols[3].text.strip() - a_href = cols[0].find('a') - href = host_url + a_href['href'] if a_href else '' - - all_data.append({ - 'distributors': name, - 'title': title, - 'label': label, - 'year': year, - 'rev': rev, - 'href': href - }) - return soup - -# 处理翻页,星座的无需翻页 -def handle_pagination(soup, astro): - return None - -# 获取列表页 -def process_list_gage(): - global distr_map - - logging.info(f"Fetching data for {dist_list_url} ...") - select_element = None - while True: - html = fetch_page(dist_list_url) - if html: - soup = BeautifulSoup(html, "html.parser") - select_element = soup.find('select', {'name': 'Distrib'}) - if select_element : - break - else: - logging.info(f"wrong html content. retring {dist_list_url} ...") - else: - logging.info(f"wrong html content. retring {dist_list_url} ...") - - if not select_element: - return None - - options = select_element.find_all('option') - for option in options: - value = option.get('value') # 获取 value 属性 - text = option.text.strip() # 获取文本内容 - distr_map[int(value)] = text - logging.info(f'fetch {dist_list_url} succ. total distributors: {len(distr_map)}') - return True - -# 主逻辑函数:循环处理每个种族 -def process_main_data(): - for dis_key, dis_name in distr_map.items(): - url = base_url + str(dis_key) - next_url = url - logging.info(f"Fetching data for {dis_name}, url {url} ...") - - while next_url: - html = fetch_page(next_url) - if html: - soup = parse_page(html, dis_name) - if soup: - next_url = handle_pagination(soup, dis_name) - else: - logging.info(f"wrong html content. retring {next_url} ...") - # 定期保存结果 - save_data() - time.sleep(2) # 控制访问频率 - else: - logging.info(f"Retrying {next_url} ...") - time.sleep(5) # 等待后再重试 - -# 保存到文件 -def save_data(): - with open(f'{res_dir}/distributors.json', 'w', encoding='utf-8') as json_file: - json.dump(all_data, json_file, indent=4, ensure_ascii=False) - - with open(f'{res_dir}/distributors.csv', 'w', newline='', encoding='utf-8') as csv_file: - writer = csv.DictWriter(csv_file, fieldnames=['distributors', 'title', 'label', 'year', 'rev', 'href']) - writer.writeheader() - writer.writerows(all_data) - -# 执行主逻辑 -if __name__ == '__main__': - #process_list_gage() - process_main_data() - save_data() - logging.info("Data fetching and saving completed.") \ No newline at end of file diff --git a/scripts/iafd/html/01-aries.html b/scripts/iafd/html/01-aries.html deleted file mode 100644 index 168176b..0000000 --- a/scripts/iafd/html/01-aries.html +++ /dev/null @@ -1,357 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -aries - iafd.com - internet adult film database - - - - - - - - -
-
-
-

astrology - aries

- - -

March 21

March 22

March 23

March 24

March 25

March 26

March 27

March 28

March 29

March 30

March 31

April 01

April 02

April 03

April 04

April 05

April 06

April 07

April 08

April 09

April 10

April 11

April 12

April 13

April 14

April 15

April 16

April 17

April 18

April 19

- -

Pick a symbol:

- - -

Zodiac Images: Salvatore Vuono / FreeDigitalPhotos.net

- -
-
-
- - - - - - - -
- -
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/iafd/html/02-taurus.html b/scripts/iafd/html/02-taurus.html deleted file mode 100644 index 7b49cdc..0000000 --- a/scripts/iafd/html/02-taurus.html +++ /dev/null @@ -1,357 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -taurus - iafd.com - internet adult film database - - - - - - - - -
-
-
-

astrology - taurus

- - -

April 20

April 21

April 22

April 23

April 24

April 25

April 26

April 27

April 28

April 29

April 30

May 01

May 02

May 03

May 04

May 05

May 06

May 07

May 08

May 09

May 10

May 11

May 12

May 13

May 14

May 15

May 16

May 17

May 18

May 19

May 20

- -

Pick a symbol:

- - -

Zodiac Images: Salvatore Vuono / FreeDigitalPhotos.net

- -
-
-
- - - - - - - -
- -
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/iafd/html/03-gemini.html b/scripts/iafd/html/03-gemini.html deleted file mode 100644 index 5051dc8..0000000 --- a/scripts/iafd/html/03-gemini.html +++ /dev/null @@ -1,357 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -gemini - iafd.com - internet adult film database - - - - - - - - -
-
-
-

astrology - gemini

- - -

May 21

May 22

May 23

May 24

May 25

May 26

May 27

May 28

May 29

May 30

May 31

June 01

June 02

June 03

June 04

June 05

June 06

June 07

June 08

June 09

June 10

June 11

June 12

June 13

June 14

June 15

June 16

June 17

June 18

June 19

June 20

- -

Pick a symbol:

- - -

Zodiac Images: Salvatore Vuono / FreeDigitalPhotos.net

- -
-
-
- - - - - - - -
- -
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/iafd/html/04-cancer.html b/scripts/iafd/html/04-cancer.html deleted file mode 100644 index edadc6c..0000000 --- a/scripts/iafd/html/04-cancer.html +++ /dev/null @@ -1,357 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -cancer - iafd.com - internet adult film database - - - - - - - - -
-
-
-

astrology - cancer

- - -

June 21

June 22

June 23

June 24

June 25

June 26

June 27

June 28

June 29

June 30

July 01

July 02

July 03

July 04

July 05

July 06

July 07

July 08

July 09

July 10

July 11

July 12

July 13

July 14

July 15

July 16

July 17

July 18

July 19

July 20

July 21

July 22

- -

Pick a symbol:

- - -

Zodiac Images: Salvatore Vuono / FreeDigitalPhotos.net

- -
-
-
- - - - - - - -
- -
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/iafd/html/05-leo.html b/scripts/iafd/html/05-leo.html deleted file mode 100644 index 1eff18f..0000000 --- a/scripts/iafd/html/05-leo.html +++ /dev/null @@ -1,357 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -leo - iafd.com - internet adult film database - - - - - - - - -
-
-
-

astrology - leo

- - -

July 23

July 24

July 25

July 26

July 27

July 28

July 29

July 30

July 31

August 01

August 02

August 03

August 04

August 05

August 06

August 07

August 08

August 09

August 10

August 11

August 12

August 13

August 14

August 15

August 16

August 17

August 18

August 19

August 20

August 21

August 22

- -

Pick a symbol:

- - -

Zodiac Images: Salvatore Vuono / FreeDigitalPhotos.net

- -
-
-
- - - - - - - -
- -
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/iafd/html/06-virgo.html b/scripts/iafd/html/06-virgo.html deleted file mode 100644 index c665410..0000000 --- a/scripts/iafd/html/06-virgo.html +++ /dev/null @@ -1,357 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -virgo - iafd.com - internet adult film database - - - - - - - - -
-
-
-

astrology - virgo

- - -

August 23

August 24

August 25

August 26

August 27

August 28

August 29

August 30

August 31

September 01

September 02

September 03

September 04

September 05

September 06

September 07

September 08

September 09

September 10

September 11

September 12

September 13

September 14

September 15

September 16

September 17

September 18

September 19

September 20

September 21

September 22

- -

Pick a symbol:

- - -

Zodiac Images: Salvatore Vuono / FreeDigitalPhotos.net

- -
-
-
- - - - - - - -
- -
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/iafd/html/07-libra.html b/scripts/iafd/html/07-libra.html deleted file mode 100644 index d5ca2c8..0000000 --- a/scripts/iafd/html/07-libra.html +++ /dev/null @@ -1,357 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -libra - iafd.com - internet adult film database - - - - - - - - -
-
-
-

astrology - libra

- - -

September 23

September 24

September 25

September 26

September 27

September 28

September 29

September 30

October 01

October 02

October 03

October 04

October 05

October 06

October 07

October 08

October 09

October 10

October 11

October 12

October 13

October 14

October 15

October 16

October 17

October 18

October 19

October 20

October 21

October 22

- -

Pick a symbol:

- - -

Zodiac Images: Salvatore Vuono / FreeDigitalPhotos.net

- -
-
-
- - - - - - - -
- -
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/iafd/html/08-scorpio.html b/scripts/iafd/html/08-scorpio.html deleted file mode 100644 index b9f2b24..0000000 --- a/scripts/iafd/html/08-scorpio.html +++ /dev/null @@ -1,357 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -scorpio - iafd.com - internet adult film database - - - - - - - - -
-
-
-

astrology - scorpio

- - -

October 23

October 24

October 25

October 26

October 27

October 28

October 29

October 30

October 31

November 01

November 02

November 03

November 04

November 05

November 06

November 07

November 08

November 09

November 10

November 11

November 12

November 13

November 14

November 15

November 16

November 17

November 18

November 19

November 20

November 21

- -

Pick a symbol:

- - -

Zodiac Images: Salvatore Vuono / FreeDigitalPhotos.net

- -
-
-
- - - - - - - -
- -
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/iafd/html/09-sagittarius.html b/scripts/iafd/html/09-sagittarius.html deleted file mode 100644 index 3e8e25e..0000000 --- a/scripts/iafd/html/09-sagittarius.html +++ /dev/null @@ -1,357 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -sagittarius - iafd.com - internet adult film database - - - - - - - - -
-
-
-

astrology - sagittarius

- - -

November 22

November 23

November 24

November 25

November 26

November 27

November 28

November 29

November 30

December 01

December 02

December 03

December 04

December 05

December 06

December 07

December 08

December 09

December 10

December 11

December 12

December 13

December 14

December 15

December 16

December 17

December 18

December 19

December 20

December 21

- -

Pick a symbol:

- - -

Zodiac Images: Salvatore Vuono / FreeDigitalPhotos.net

- -
-
-
- - - - - - - -
- -
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/iafd/html/10-capricorn.html b/scripts/iafd/html/10-capricorn.html deleted file mode 100644 index fa7cd4e..0000000 --- a/scripts/iafd/html/10-capricorn.html +++ /dev/null @@ -1,357 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -capricorn - iafd.com - internet adult film database - - - - - - - - -
-
-
-

astrology - capricorn

- - -

December 22

December 23

December 24

December 25

December 26

December 27

December 28

December 29

December 30

December 31

January 01

January 02

January 03

January 04

January 05

January 06

January 07

January 08

January 09

January 10

January 11

January 12

January 13

January 14

January 15

January 16

January 17

January 18

January 19

- -

Pick a symbol:

- - -

Zodiac Images: Salvatore Vuono / FreeDigitalPhotos.net

- -
-
-
- - - - - - - -
- -
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/iafd/html/11-aquarius.html b/scripts/iafd/html/11-aquarius.html deleted file mode 100644 index 3af889b..0000000 --- a/scripts/iafd/html/11-aquarius.html +++ /dev/null @@ -1,357 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -aquarius - iafd.com - internet adult film database - - - - - - - - -
-
-
-

astrology - aquarius

- - -

January 20

January 21

January 22

January 23

January 24

January 25

January 26

January 27

January 28

January 29

January 30

January 31

February 01

February 02

February 03

February 04

February 05

February 06

February 07

February 08

February 09

February 10

February 11

February 12

February 13

February 14

February 15

February 16

February 17

February 18

- -

Pick a symbol:

- - -

Zodiac Images: Salvatore Vuono / FreeDigitalPhotos.net

- -
-
-
- - - - - - - -
- -
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/iafd/html/12-pisces.html b/scripts/iafd/html/12-pisces.html deleted file mode 100644 index 86e6abd..0000000 --- a/scripts/iafd/html/12-pisces.html +++ /dev/null @@ -1,357 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -pisces - iafd.com - internet adult film database - - - - - - - - -
-
-
-

astrology - pisces

- - -

February 19

February 20

February 21

February 22

February 23

February 24

February 25

February 26

February 27

February 28

February 29

March 01

March 02

March 03

March 04

March 05

March 06

March 07

March 08

March 09

March 10

March 11

March 12

March 13

March 14

March 15

March 16

March 17

March 18

March 19

March 20

- -

Pick a symbol:

- - -

Zodiac Images: Salvatore Vuono / FreeDigitalPhotos.net

- -
-
-
- - - - - - - -
- -
- -
- -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/iafd/html_format.py b/scripts/iafd/html_format.py deleted file mode 100644 index 1ec0a38..0000000 --- a/scripts/iafd/html_format.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -Script Name: -Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare - detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。 - list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全 - list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全 - list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的 - list_merge.py 上面三个列表的数据,取交集,得到整体数据。 - iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配) - - html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。 - data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并; - stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并 - 从而获取到一份完整的数据列表。 - -Author: [Your Name] -Created Date: YYYY-MM-DD -Last Modified: YYYY-MM-DD -Version: 1.0 - -Modification History: - - YYYY-MM-DD [Your Name]: - - YYYY-MM-DD [Your Name]: - - YYYY-MM-DD [Your Name]: -""" - -import os -import json -import csv -from bs4 import BeautifulSoup - -INPUT_DIR = "html" -OUTPUT_JSON = "./result/iafd_meta.json" -OUTPUT_CSV = "./result/iafd_meta.csv" -BASE_URL = "https://www.iafd.com" - -def parse_html_file(filepath): - """解析单个 HTML 文件,提取需要的信息。""" - person_list = [] - filename = os.path.basename(filepath) - filename = os.path.splitext(filename)[0] - - with open(filepath, "r", encoding="utf-8") as file: - soup = BeautifulSoup(file, "html.parser") - astro_div = soup.find("div", id="astro") - - if not astro_div: - print(f"Warning: No 'astro' div found in {filename}") - return [] - - birth_date = None - for elem in astro_div.find_all(recursive=False): - if elem.name == "h3" and "astroday" in elem.get("class", []): - birth_date = elem.get_text(strip=True) - elif elem.name == "div" and "perficon" in elem.get("class", []): - a_tag = elem.find("a") - if a_tag: - href = BASE_URL + a_tag["href"] - name = a_tag.find("span", class_="perfname") - if name: - person_list.append({ - "astrology": filename, - "birth_date": birth_date, - "person": name.get_text(strip=True), - "href": href - }) - return person_list - -def main(): - all_persons = [] - for filename in os.listdir(INPUT_DIR): - if filename.endswith(".html"): - filepath = os.path.join(INPUT_DIR, filename) - print(f"正在解析 {filename} ...") - all_persons.extend(parse_html_file(filepath)) - - # 保存 JSON - with open(OUTPUT_JSON, "w", encoding="utf-8") as json_file: - json.dump(all_persons, json_file, indent=4, ensure_ascii=False) - - # 保存 CSV - with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csv_file: - writer = csv.DictWriter(csv_file, fieldnames=["astrology", "birth_date", "person", "href"]) - writer.writeheader() - writer.writerows(all_persons) - - print(f"Data extracted and saved to {OUTPUT_JSON} and {OUTPUT_CSV}") - -if __name__ == "__main__": - main() diff --git a/scripts/iafd/result/movie_details.csv b/scripts/iafd/result/movie_details.csv new file mode 100644 index 0000000..dca6bd7 --- /dev/null +++ b/scripts/iafd/result/movie_details.csv @@ -0,0 +1,20 @@ +href,title,Minutes,Distributor,Studio,ReleaseDate,AddedtoIAFDDate,All-Girl,All-Male,Compilation,Webscene,Director +https://www.iafd.com/title.rme/id=aefba072-0133-4b0e-8a88-e2d5ea24ff06,About Last Night,No Data,nubilefilms.com,GirlsOnlyPorn.com,"Mar 15, 2022","Mar 15, 2022",Yes,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=d6034e14-b6e0-4a4c-9800-46f00bdde130,Absolute Beauty,No Data,nubilefilms.com,nubilefilms.com,"Sep 28, 2016","Aug 2, 2021",No,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=8014b300-3307-4d36-828e-b29963742a65,Ace In The Sheets,No Data,nubilefilms.com,nubilefilms.com,"Jun 08, 2020","Jun 9, 2020",No,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=62990006-e227-4596-852c-d38b3a0923f8,Addicted to Love,No Data,nubilefilms.com,nubilefilms.com,"Mar 25, 2013","Aug 2, 2021",No,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=cd5a3966-226c-4d72-b7a0-f838104834ec,Adorable (II),No Data,nubilefilms.com,GirlsOnlyPorn.com,"Dec 19, 2012","Aug 2, 2021",Yes,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=9740af6b-1633-481a-8ec0-5d7bade7b404,After Class,No Data,nubilefilms.com,nubilefilms.com,"Jan 10, 2021","Aug 2, 2021",No,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=eeb5b776-5e42-40eb-b13b-34003c519111,After Party (III),No Data,nubilefilms.com,nubilefilms.com,"Jul 30, 2015","Aug 2, 2021",No,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=f4a624b8-43cb-43b6-80ef-1156cd0a1f09,After the Sunset,No Data,nubilefilms.com,GirlsOnlyPorn.com,"Jan 05, 2016","Aug 2, 2021",Yes,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=189309aa-54ea-4081-820f-5783e08db8d7,Afterhours (II),No Data,nubilefilms.com,GirlsOnlyPorn.com,"Feb 27, 2013","Aug 2, 2021",Yes,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=17489ed9-4710-4672-b3fa-fd2e25a70750,Afternoon Delight,No Data,nubilefilms.com,nubilefilms.com,"Apr 13, 2015","Aug 2, 2021",No,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=ec88d89a-b538-432c-9d27-b67f4f6c826f,Afternoon Fantasy,No Data,nubilefilms.com,GirlsOnlyPorn.com,"Aug 31, 2012","Aug 2, 2021",Yes,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=8f398cf1-1009-4997-a264-ef29c96ad608,Afternoon Lounge,No Data,nubilefilms.com,GirlsOnlyPorn.com,"Aug 30, 2012","Aug 2, 2021",Yes,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=31707f4b-a82f-47cb-b714-83f9a3fa4403,Again In The Morning,No Data,nubilefilms.com,nfbusty.com,"Jul 05, 2022","Jul 6, 2022",No,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=e94c5738-fe9b-4905-b300-d22a553ee4ca,Agonizing Release,No Data,nubilefilms.com,GirlsOnlyPorn.com,"Mar 13, 2013","Aug 2, 2021",Yes,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=f8d8379d-c5bb-4b43-ade4-96e937a08115,All About Tonight,36,nubilefilms.com,nubilefilms.com,"Feb 16, 2022","Feb 16, 2022",No,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=73bb5218-1023-49ba-8c94-3b9fb14fee94,All Alone,No Data,nubilefilms.com,GirlsOnlyPorn.com,"Oct 28, 2012","Aug 2, 2021",Yes,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=0e1b42a0-ccd6-4469-967f-96a10072b68e,All Business,No Data,nubilefilms.com,nubilefilms.com,"Dec 06, 2015","Aug 2, 2021",No,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=b3658e59-afa4-45e6-b842-6e26f9b3a867,All For Love,No Data,nubilefilms.com,hotcrazymess.com,"Aug 02, 2019","Aug 2, 2019",No,No,No,Yes,No Data +https://www.iafd.com/title.rme/id=ddf2ef79-543d-4e89-b609-6b9e707dfac4,All For Lust,22,nubilefilms.com,nubilefilms.com,"Jan 03, 2019","Jan 4, 2019",No,No,No,Yes,No Data diff --git a/scripts/iafd/result/movie_details.json b/scripts/iafd/result/movie_details.json new file mode 100644 index 0000000..dbd114e --- /dev/null +++ b/scripts/iafd/result/movie_details.json @@ -0,0 +1,809 @@ +[ + { + "href": "https://www.iafd.com/title.rme/id=aefba072-0133-4b0e-8a88-e2d5ea24ff06", + "title": "About Last Night", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "GirlsOnlyPorn.com", + "ReleaseDate": "Mar 15, 2022", + "AddedtoIAFDDate": "Mar 15, 2022", + "All-Girl": "Yes", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm", + "Performers": [ + { + "name": "Alex Coal", + "href": "https://www.iafd.com/person.rme/id=a731a328-f69d-4ac5-a82f-81e010341895", + "tags": [ + "Alex Coal", + "LezOnly Bald" + ] + }, + { + "name": "Liz Jordan", + "href": "https://www.iafd.com/person.rme/id=d2b9fa26-3062-4b90-a1dd-10f3ef26fb68", + "tags": [ + "Liz Jordan", + "LezOnly Bald" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Alex Coal", + "Liz Jordan" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=d6034e14-b6e0-4a4c-9800-46f00bdde130", + "title": "Absolute Beauty", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "nubilefilms.com", + "ReleaseDate": "Sep 28, 2016", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm", + "Performers": [ + { + "name": "Aisha", + "href": "https://www.iafd.com/person.rme/id=20d3df80-22d1-4375-a8e5-7b276f25d8d3", + "tags": [ + "Aisha" + ] + }, + { + "name": "Nick Ross", + "href": "https://www.iafd.com/person.rme/id=0a566505-1e62-467a-ba79-3e45b5f540de", + "tags": [ + "Nick Ross" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Aisha", + "Nick Ross" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=8014b300-3307-4d36-828e-b29963742a65", + "title": "Ace In The Sheets", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "nubilefilms.com", + "ReleaseDate": "Jun 08, 2020", + "AddedtoIAFDDate": "Jun 9, 2020", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm", + "Performers": [ + { + "name": "Codey Steele", + "href": "https://www.iafd.com/person.rme/id=4666d258-5053-401b-bcc6-22737ff15a89", + "tags": [ + "Codey Steele" + ] + }, + { + "name": "Karla Kush", + "href": "https://www.iafd.com/person.rme/id=91e5e13c-e5de-4d0a-83c8-d00a19959ffd", + "tags": [ + "Karla Kush" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Karla Kush", + "Codey Steele" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=62990006-e227-4596-852c-d38b3a0923f8", + "title": "Addicted to Love", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "nubilefilms.com", + "ReleaseDate": "Mar 25, 2013", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm", + "Performers": [ + { + "name": "Logan Pierce", + "href": "https://www.iafd.com/person.rme/id=ec02e34e-1f30-489d-934b-e1fb22531a6e", + "tags": [ + "Logan Pierce" + ] + }, + { + "name": "Presley Hart", + "href": "https://www.iafd.com/person.rme/id=8497863b-5a34-4594-bd4d-8188ac909a05", + "tags": [ + "Presley Hart" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Presley Hart", + "Logan Pierce" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=cd5a3966-226c-4d72-b7a0-f838104834ec", + "title": "Adorable (II)", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "GirlsOnlyPorn.com", + "ReleaseDate": "Dec 19, 2012", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "Yes", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm", + "Performers": [ + { + "name": "Beata Undine", + "href": "https://www.iafd.com/person.rme/id=46d59a10-988e-453b-9ed3-aafa1a0a3b5a", + "tags": [ + "Beata Undine", + "MastOnly" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Beata Undine" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=9740af6b-1633-481a-8ec0-5d7bade7b404", + "title": "After Class", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "nubilefilms.com", + "ReleaseDate": "Jan 10, 2021", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm", + "Performers": [ + { + "name": "Alex Coal", + "href": "https://www.iafd.com/person.rme/id=a731a328-f69d-4ac5-a82f-81e010341895", + "tags": [ + "Alex Coal", + "Creampie" + ] + }, + { + "name": "Stirling Cooper", + "href": "https://www.iafd.com/person.rme/id=a983951b-87d5-4dd9-a5c7-597e0d9e59f4", + "tags": [ + "Stirling Cooper" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Alex Coal", + "Stirling Cooper" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=eeb5b776-5e42-40eb-b13b-34003c519111", + "title": "After Party (III)", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "nubilefilms.com", + "ReleaseDate": "Jul 30, 2015", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm", + "Performers": [ + { + "name": "Dakota Skye", + "href": "https://www.iafd.com/person.rme/id=24d4ce7c-777f-4303-b122-9167b15100a8", + "tags": [ + "Dakota Skye" + ] + }, + { + "name": "Samantha Hayes", + "href": "https://www.iafd.com/person.rme/id=9fc8d827-52bc-43d2-8eee-d57a7274745d", + "tags": [ + "Samantha Hayes" + ] + }, + { + "name": "Van Wylde", + "href": "https://www.iafd.com/person.rme/id=61980288-9c48-4875-b319-0cef87f26ee6", + "tags": [ + "Van Wylde" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Dakota Skye", + "Samantha Hayes", + "Van Wylde" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=f4a624b8-43cb-43b6-80ef-1156cd0a1f09", + "title": "After the Sunset", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "GirlsOnlyPorn.com", + "ReleaseDate": "Jan 05, 2016", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "Yes", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm", + "Performers": [ + { + "name": "Cadence Lux", + "href": "https://www.iafd.com/person.rme/id=d54dc6db-7800-4f98-a2f1-ca37682a8810", + "tags": [ + "Cadence Lux", + "LezOnly" + ] + }, + { + "name": "Xandra Sixx", + "href": "https://www.iafd.com/person.rme/id=da1d6ba3-13cd-4dd8-83e3-49a027c3c1fc", + "tags": [ + "Xandra Sixx", + "LezOnly" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Cadence Lux", + "Xandra Sixx" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=189309aa-54ea-4081-820f-5783e08db8d7", + "title": "Afterhours (II)", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "GirlsOnlyPorn.com", + "ReleaseDate": "Feb 27, 2013", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "Yes", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm", + "Performers": [ + { + "name": "Aiden Ashley", + "href": "https://www.iafd.com/person.rme/id=4cf532c9-e713-4a93-b455-37c55a7598e3", + "tags": [ + "Aiden Ashley", + "MastOnly" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Aiden Ashley" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=17489ed9-4710-4672-b3fa-fd2e25a70750", + "title": "Afternoon Delight", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "nubilefilms.com", + "ReleaseDate": "Apr 13, 2015", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm", + "Performers": [ + { + "name": "Denis Reed", + "href": "https://www.iafd.com/person.rme/id=c4eb5784-e962-495e-bcfd-0a4b0370b3a4", + "tags": [ + "Denis Reed" + ] + }, + { + "name": "Olivia Devine", + "href": "https://www.iafd.com/person.rme/id=44a8ec27-3d38-43e3-a505-89832b8f4a7f", + "tags": [ + "Olivia Devine" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Olivia Devine", + "Denis Reed" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=ec88d89a-b538-432c-9d27-b67f4f6c826f", + "title": "Afternoon Fantasy", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "GirlsOnlyPorn.com", + "ReleaseDate": "Aug 31, 2012", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "Yes", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm", + "Performers": [ + { + "name": "Ally Summers", + "href": "https://www.iafd.com/person.rme/id=9ae9dbe6-40be-4347-a4f4-7c9c78a4cf72", + "tags": [ + "Ally Summers", + "LezOnly" + ] + }, + { + "name": "Kiki Kandy", + "href": "https://www.iafd.com/person.rme/id=f3fe5d28-4d99-4157-8f96-701049574e18", + "tags": [ + "Kiki Kandy", + "LezOnly" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Ally Summers", + "Kiki Kandy" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=8f398cf1-1009-4997-a264-ef29c96ad608", + "title": "Afternoon Lounge", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "GirlsOnlyPorn.com", + "ReleaseDate": "Aug 30, 2012", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "Yes", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm", + "Performers": [ + { + "name": "Alexis Venton", + "href": "https://www.iafd.com/person.rme/id=7a6f6bb6-4e64-41d7-8cbe-5613acbf9bdf", + "tags": [ + "Alexis Venton", + "MastOnly" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Alexis Venton" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=31707f4b-a82f-47cb-b714-83f9a3fa4403", + "title": "Again In The Morning", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "nfbusty.com", + "ReleaseDate": "Jul 05, 2022", + "AddedtoIAFDDate": "Jul 6, 2022", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=8673/nfbusty%2ecom.htm", + "Performers": [ + { + "name": "Jay Romero", + "href": "https://www.iafd.com/person.rme/id=35ed8398-192b-44f2-aa90-674b81df9a5d", + "tags": [ + "Jay Romero" + ] + }, + { + "name": "Octavia Red", + "href": "https://www.iafd.com/person.rme/id=071bfa48-1e70-4dfb-bea3-6c591ccc0f08", + "tags": [ + "Octavia Red" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Octavia Red", + "Jay Romero" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=e94c5738-fe9b-4905-b300-d22a553ee4ca", + "title": "Agonizing Release", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "GirlsOnlyPorn.com", + "ReleaseDate": "Mar 13, 2013", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "Yes", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm", + "Performers": [ + { + "name": "Maddy O'Reilly", + "href": "https://www.iafd.com/person.rme/id=55c1362a-b07e-4015-8b09-d96b611427f7", + "tags": [ + "Maddy O'Reilly", + "MastOnly" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Maddy O'Reilly" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=f8d8379d-c5bb-4b43-ade4-96e937a08115", + "title": "All About Tonight", + "Minutes": "36", + "Distributor": "nubilefilms.com", + "Studio": "nubilefilms.com", + "ReleaseDate": "Feb 16, 2022", + "AddedtoIAFDDate": "Feb 16, 2022", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm", + "Performers": [ + { + "name": "Kristof Cale", + "href": "https://www.iafd.com/person.rme/id=8fe7d706-53aa-4810-992e-cfb207f15ec5", + "tags": [ + "Kristof Cale" + ] + }, + { + "name": "Marilyn Sugar", + "href": "https://www.iafd.com/person.rme/id=38c9f144-801a-4c82-abbc-d610e5b7648b", + "tags": [ + "Marilyn Sugar", + "Bald Creampie" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Marilyn Sugar", + "Kristof Cale" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=73bb5218-1023-49ba-8c94-3b9fb14fee94", + "title": "All Alone", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "GirlsOnlyPorn.com", + "ReleaseDate": "Oct 28, 2012", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "Yes", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm", + "Performers": [ + { + "name": "Marry Queen", + "href": "https://www.iafd.com/person.rme/id=6410fb45-95e4-4736-b241-be699f16ba93", + "tags": [ + "Marry Queen", + "MastOnly" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Marry Queen" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=0e1b42a0-ccd6-4469-967f-96a10072b68e", + "title": "All Business", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "nubilefilms.com", + "ReleaseDate": "Dec 06, 2015", + "AddedtoIAFDDate": "Aug 2, 2021", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm", + "Performers": [ + { + "name": "Chloe Amour", + "href": "https://www.iafd.com/person.rme/id=ea8a5be1-1831-4afa-9016-c1a0b8da9f56", + "tags": [ + "Chloe Amour" + ] + }, + { + "name": "Kinsley Eden", + "href": "https://www.iafd.com/person.rme/id=f8d4d70c-2fe8-4242-b2dd-d7508bb546a0", + "tags": [ + "Kinsley Eden" + ] + }, + { + "name": "Ryan Driller", + "href": "https://www.iafd.com/person.rme/id=03462f35-fc7d-48ed-821e-c128b5a330a9", + "tags": [ + "Ryan Driller" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Chloe Amour", + "Kinsley Eden", + "Ryan Driller" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=b3658e59-afa4-45e6-b842-6e26f9b3a867", + "title": "All For Love", + "Minutes": "No Data", + "Distributor": "nubilefilms.com", + "Studio": "hotcrazymess.com", + "ReleaseDate": "Aug 02, 2019", + "AddedtoIAFDDate": "Aug 2, 2019", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=10229/hotcrazymess%2ecom.htm", + "Performers": [ + { + "name": "Amber Addis", + "href": "https://www.iafd.com/person.rme/id=d555438f-422a-4d0b-9a6c-40d7055be2ff", + "tags": [ + "Amber Addis", + "Facial" + ] + }, + { + "name": "Dick Swardson", + "href": "https://www.iafd.com/person.rme/id=4353efa9-d7e5-44f3-a4c3-35d3be3d70c8", + "tags": [ + "Dick Swardson" + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Amber Addis", + "Dick Swardson" + ] + } + ], + "AppearsIn": [] + }, + { + "href": "https://www.iafd.com/title.rme/id=ddf2ef79-543d-4e89-b609-6b9e707dfac4", + "title": "All For Lust", + "Minutes": "22", + "Distributor": "nubilefilms.com", + "Studio": "nubilefilms.com", + "ReleaseDate": "Jan 03, 2019", + "AddedtoIAFDDate": "Jan 4, 2019", + "All-Girl": "No", + "All-Male": "No", + "Compilation": "No", + "Webscene": "Yes", + "Director": "No Data", + "DirectorHref": "", + "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm", + "StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm", + "Performers": [ + { + "name": "Cindy Shine", + "href": "https://www.iafd.com/person.rme/id=d0cbd002-4dfe-419b-90fc-74df1a527e30", + "tags": [ + "Cindy Shine" + ] + }, + { + "name": "Daniel G.", + "href": "https://www.iafd.com/person.rme/id=8d84a7ac-56f9-4cf7-84c6-ad2ef5efac1f", + "tags": [ + "Daniel G." + ] + } + ], + "SceneBreakdowns": [ + { + "scene": "Scene 1", + "performers": [ + "Cindy Shine", + "Daniel G." + ] + } + ], + "AppearsIn": [] + } +] \ No newline at end of file diff --git a/scripts/iafd/result/movie_list.json b/scripts/iafd/result/movie_list.json new file mode 120000 index 0000000..7091603 --- /dev/null +++ b/scripts/iafd/result/movie_list.json @@ -0,0 +1 @@ +distributors.json \ No newline at end of file diff --git a/scripts/iafd/config.py b/scripts/iafd/src_json/config.py similarity index 82% rename from scripts/iafd/config.py rename to scripts/iafd/src_json/config.py index 27942b9..fc7fc09 100644 --- a/scripts/iafd/config.py +++ b/scripts/iafd/src_json/config.py @@ -3,13 +3,8 @@ import os import inspect from datetime import datetime -# MySQL 配置 -db_config = { - 'host': '172.18.0.3', - 'user': 'root', - 'password': 'mysqlpw', - 'database': 'stockdb' -} +global_share_data_dir = '/root/sharedata' +global_host_data_dir = '/root/hostdir/scripts_data' # 设置日志配置 def setup_logging(log_filename=None): @@ -22,7 +17,7 @@ def setup_logging(log_filename=None): # 获取当前日期,格式为 yyyymmdd current_date = datetime.now().strftime('%Y%m%d') # 拼接 log 文件名,将日期加在扩展名前 - log_filename = f'./log/{caller_filename}_{current_date}.log' + log_filename = f'../log/{caller_filename}_{current_date}.log' logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s', handlers=[ diff --git a/scripts/iafd/movie_meta_fetch.py b/scripts/iafd/src_json/movie_detail_fetch.py similarity index 67% rename from scripts/iafd/movie_meta_fetch.py rename to scripts/iafd/src_json/movie_detail_fetch.py index e921621..247dd34 100644 --- a/scripts/iafd/movie_meta_fetch.py +++ b/scripts/iafd/src_json/movie_detail_fetch.py @@ -16,15 +16,18 @@ config.setup_logging() host_url = "https://www.iafd.com" # 目录和文件路径 -RESULT_DIR = "result" +RESULT_DIR = "../result" +OUTPUT_DIR = f"{config.global_share_data_dir}/iafd" INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json") -OUTPUT_JSON = os.path.join(RESULT_DIR, "movie_details.json") -OUTPUT_CSV = os.path.join(RESULT_DIR, "movie_details.csv") +OUTPUT_JSON = os.path.join(OUTPUT_DIR, "movie_details.json") +OUTPUT_CSV = os.path.join(OUTPUT_DIR, "movie_details.csv") BATCH_SIZE = 100 # 每100条数据写入文件 # 初始化 Cloudflare 绕过工具 scraper = cloudscraper.create_scraper() +# 全量数据 +all_movies = [] def load_existing_data(): """加载已处理的数据,支持续传""" @@ -37,9 +40,10 @@ def load_existing_data(): return [] -def save_data(all_movies): +def save_data(): """保存数据到 JSON 和 CSV 文件""" logging.info("Saving data...") + global all_movies with open(OUTPUT_JSON, "w", encoding="utf-8") as f: json.dump(all_movies, f, indent=4, ensure_ascii=False) @@ -53,7 +57,7 @@ def save_data(all_movies): movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"], movie["All-Girl"], movie["All-Male"], movie["Compilation"], movie["Webscene"], movie["Director"]]) - +# 请求网页并返回 HTML 内容 def fetch_html(href): """请求网页并返回 HTML 内容""" for attempt in range(3): @@ -68,14 +72,13 @@ def fetch_html(href): logging.error(f"Failed to fetch {href} after 3 attempts") return None - +# 解析网页 HTML 并提取电影信息 def parse_movie_details(html, href, title): """解析网页 HTML 并提取电影信息""" soup = BeautifulSoup(html, "html.parser") # 解析电影基础信息 movie_data = {} - director_href = '' info_div = soup.find("div", class_="col-xs-12 col-sm-3") if info_div: labels = info_div.find_all("p", class_="bioheading") @@ -87,8 +90,7 @@ def parse_movie_details(html, href, title): link = value.find("a") if link: val = link.text.strip() - if key == 'Director': - director_href = host_url + link['href'] + movie_data[f'{key}Href'] = host_url + link['href'] movie_data[key] = val else: return None @@ -116,12 +118,53 @@ def parse_movie_details(html, href, title): scene_table = soup.find("div", id="sceneinfo") if scene_table: rows = scene_table.find_all("tr") + for row in rows: cols = row.find_all("td") if len(cols) >= 2: - scene = cols[0].text.strip() - scene_performers = [p.strip() for p in cols[1].text.split(",")] - scene_breakdowns.append({"scene": scene, "performers": scene_performers}) + scene = cols[0].text.strip() # 场景编号 + performer_info = cols[1] # 包含表演者及链接信息 + + # 获取
之前的完整 HTML(保留 标签等格式) + performer_html = str(performer_info) # 获取所有HTML内容 + split_html = performer_html.split("
") # 按
进行分割 + if split_html: + performers_html = split_html[0].strip() # 取
之前的部分 + else: + split_html = performer_html.split("
") # 按
进行分割 + if split_html: + performers_html = split_html[0].strip() # 取
之前的部分 + else: + performers_html = performer_html.strip() # 如果没有
,取全部 + + # 解析为纯文本(去除HTML标签,仅提取文本内容) + performers_soup = BeautifulSoup(performers_html, "html.parser") + performers_text = performers_soup.get_text() + + # 提取表演者 + scene_performers = [p.strip() for p in performers_text.split(",")] + + # 尝试获取 `webscene` 和 `studio` + links_data = {} + links = performer_info.find_all("a") + if links: + webscene_title = links[0].text.strip() if len(links)>0 else None + webscene = links[0]["href"] if len(links)>0 else None + studio = links[1].text.strip() if len(links)>1 else None + studio_lnk = links[1]["href"] if len(links)>1 else None + links_data = { + "title": webscene_title, + "webscene": webscene, + "studio": studio, + "studio_lnk": studio_lnk, + } + + scene_data = { + "scene": scene, + "performers": scene_performers, + **links_data, + } + scene_breakdowns.append(scene_data) appears_in = [] appears_divs = soup.find("div", id="appearssection") @@ -146,7 +189,9 @@ def parse_movie_details(html, href, title): "Compilation": movie_data.get("Compilation", ""), "Webscene": movie_data.get("Webscene", ""), "Director": movie_data.get("Director", ""), - "DirectorHref": director_href, + "DirectorHref": movie_data.get("DirectorHref", ""), + "DistributorHref": movie_data.get("DistributorHref", ""), + "StudioHref": movie_data.get("StudioHref", ""), "Performers": performers, "SceneBreakdowns": scene_breakdowns, "AppearsIn": appears_in, @@ -155,6 +200,7 @@ def parse_movie_details(html, href, title): def process_movies(): """处理电影数据""" + global all_movies all_movies = load_existing_data() processed_hrefs = {movie["href"] for movie in all_movies} @@ -162,7 +208,6 @@ def process_movies(): with open(INPUT_FILE, "r", encoding="utf-8") as f: movies = json.load(f) - new_movies = [] count = 0 for entry in movies: @@ -170,25 +215,32 @@ def process_movies(): title = entry["title"] if href in processed_hrefs: + logging.info(f"Skiping existed: {title} ({href})") continue # 跳过已处理数据 logging.info(f"Processing: {title} ({href})") - html = fetch_html(href) - if not html: - continue # 获取失败,跳过 - - movie = parse_movie_details(html, href, title) - new_movies.append(movie) - count += 1 + while True: + html = fetch_html(href) + if not html: + logging.warning(f'Retring {title} ({href}) ') + continue # 获取失败,跳过 + else: + movie = parse_movie_details(html, href, title) + if not movie: + logging.warning(f'Retring {title} ({href}) ') + continue + else: + all_movies.append(movie) + count += 1 + break # 每 BATCH_SIZE 条数据刷新一次文件 if count % BATCH_SIZE == 0: - save_data(all_movies + new_movies) + save_data() # 最终保存文件 - all_movies.extend(new_movies) - save_data(all_movies) + save_data() logging.info("Task completed.") @@ -219,7 +271,7 @@ def process_one(href): continue # 获取失败,跳过 id = extract_id_from_href(href) - filename = f"{id}.json" # 用 - 替换空格 + filename = f"./log/{id}.json" # 用 - 替换空格 try: with open(filename, 'w', encoding='utf-8') as json_file: @@ -228,7 +280,7 @@ def process_one(href): logging.error(f"Error writing file {filename}: {e}") print(f'fetch succ. saved result in {filename}') - +# 处理程序被终止时的数据 def handle_exit_signal(signal, frame): logging.info("Gracefully exiting... Saving remaining data to Json and CSV.") save_data() @@ -246,6 +298,7 @@ def main(): save_data() logging.info("Data processing completed.") +# 程序入口,读取参数 if __name__ == "__main__": if len(sys.argv) > 1: url = sys.argv[1] diff --git a/scripts/iafd/studios_list_fetch.py b/scripts/iafd/src_json/movie_list_fetch.py similarity index 63% rename from scripts/iafd/studios_list_fetch.py rename to scripts/iafd/src_json/movie_list_fetch.py index 16a62bb..ae57c89 100644 --- a/scripts/iafd/studios_list_fetch.py +++ b/scripts/iafd/src_json/movie_list_fetch.py @@ -28,6 +28,7 @@ import cloudscraper import json import time import csv +import argparse from bs4 import BeautifulSoup import logging import config @@ -36,9 +37,50 @@ config.setup_logging() # 定义基础 URL 和可变参数 host_url = "https://www.iafd.com" -base_url = f"{host_url}/studio.rme/studio=" -list_page_url = f'{base_url}/studio.asp' +# 结果路径 +res_dir = f"{config.global_share_data_dir}/iafd" +fetch_config = { + 'dist': { + 'base_url': f"{host_url}/distrib.rme/distrib=", + 'list_page_url': f"{host_url}/distrib.asp", + 'html_table_id': 'distable', + 'html_select_name': 'Distrib', + 'output_key_id': 'distributors', + 'json_file': f'{res_dir}/distributors.json', + 'csv_file': f'{res_dir}/distributors.csv', + }, + 'stu': { + 'base_url': f"{host_url}/studio.rme/studio=", + 'list_page_url': f"{host_url}/studio.asp", + 'html_table_id': 'studio', + 'html_select_name': 'Studio', + 'output_key_id': 'studios', + 'json_file': f'{res_dir}/studios.json', + 'csv_file': f'{res_dir}/studios.csv', + } +} + +distr_map = { + 6812 : 'nubilefilms.com', + 8563 : 'teenmegaworld network', + 6779 : 'x-art.com', + 7133 : 'tushy.com', + 6496 : 'blacked.com', + 7758 : 'vixen.com', + 6791 : 'teamskeet.com', + 12454: 'vip4k.com', + 13541: 'wow network', + 9702 : 'cum4k.com', + 6778 : 'tiny4k.com', + 12667: 'anal4k.com', + 7419 : 'exotic4k.com', + 13594: 'facials4k.com', + 13633: 'mom4k.com', + 12335: 'slim4k.com', + 16709: 'strippers4k.com', + +} studio_map = { 6812 : 'nubilefilms.com', 9811 : 'Teen Mega World', @@ -65,8 +107,6 @@ headers = { } scraper = cloudscraper.create_scraper() -# 结果路径 -res_dir = './result' all_data = [] # 网络请求并解析 HTML @@ -80,12 +120,15 @@ def fetch_page(url): return None # 解析 HTML 内容,提取需要的数据 -def parse_page(html, name): +def parse_page(html, name, config): + table_id = config['html_table_id'] + key_id = config['output_key_id'] + soup = BeautifulSoup(html, "html.parser") - table = soup.find("table", id="studio") + table = soup.find("table", id=table_id) if not table: - logging.warning(f"Warning: No 'studio' table found in {name}") + logging.warning(f"Warning: No {table_id} table found in {name}") return None # 找到thead并跳过 @@ -109,7 +152,7 @@ def parse_page(html, name): href = host_url + a_href['href'] if a_href else '' all_data.append({ - 'studios': name, + key_id: name, 'title': title, 'label': label, 'year': year, @@ -123,8 +166,10 @@ def handle_pagination(soup, astro): return None # 获取列表页 -def process_list_gage(): - global studio_map +def process_list_gage(config): + list_page_url=config['list_page_url'] + select_name = config['html_select_name'] + list_map = {} logging.info(f"Fetching data for {list_page_url} ...") select_element = None @@ -132,7 +177,7 @@ def process_list_gage(): html = fetch_page(list_page_url) if html: soup = BeautifulSoup(html, "html.parser") - select_element = soup.find('select', {'name': 'Studio'}) + select_element = soup.find('select', {'name': select_name}) if select_element : break else: @@ -147,13 +192,15 @@ def process_list_gage(): for option in options: value = option.get('value') # 获取 value 属性 text = option.text.strip() # 获取文本内容 - studio_map[int(value)] = text - logging.info(f'fetch {list_page_url} succ. total distributors: {len(studio_map)}') - return True + list_map[int(value)] = text + logging.info(f'fetch {list_page_url} succ. total lines: {len(list_map)}') + return list_map # 主逻辑函数:循环处理每个种族 -def process_main_data(): - for key, name in studio_map.items(): +def process_main_data(list_data, config): + base_url = config['base_url'] + + for key, name in list_data.items(): url = base_url + str(key) next_url = url logging.info(f"Fetching data for {name}, url {url} ...") @@ -161,31 +208,48 @@ def process_main_data(): while next_url: html = fetch_page(next_url) if html: - soup = parse_page(html, name) + soup = parse_page(html, name, config) if soup: next_url = handle_pagination(soup, name) else: logging.info(f"wrong html content. retring {next_url} ...") # 定期保存结果 - save_data() + save_data(config) time.sleep(2) # 控制访问频率 else: logging.info(f"Retrying {next_url} ...") time.sleep(5) # 等待后再重试 # 保存到文件 -def save_data(): - with open(f'{res_dir}/studios.json', 'w', encoding='utf-8') as json_file: +def save_data(config): + with open(config['json_file'], 'w', encoding='utf-8') as json_file: json.dump(all_data, json_file, indent=4, ensure_ascii=False) - with open(f'{res_dir}/studios.csv', 'w', newline='', encoding='utf-8') as csv_file: - writer = csv.DictWriter(csv_file, fieldnames=['studios', 'title', 'label', 'year', 'rev', 'href']) + with open(config['csv_file'], 'w', newline='', encoding='utf-8') as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=[config['output_key_id'], 'title', 'label', 'year', 'rev', 'href']) writer.writeheader() writer.writerows(all_data) + # 执行主逻辑 if __name__ == '__main__': - #process_list_gage() - process_main_data() - save_data() - logging.info("Data fetching and saving completed.") \ No newline at end of file + # 命令行参数处理 + parser = argparse.ArgumentParser(description='fetch movie list from iafd.com') + parser.add_argument('--type', type=str, default='dist', help='fetch by ... (dist , stu)') + parser.add_argument('--kind', type=str, default='parts', help='fetch all or parts (parts , all)') + args = parser.parse_args() + + config = fetch_config[args.type] + if not config: + logging.warning(f'unkwon type: {args.type} {args.kind}') + else: + list_data = {} + if args.kind == 'all': + list_data = process_list_gage(config) + elif args.type == 'dist': + list_data = distr_map + else: + list_data = studio_map + + process_main_data(list_data, config) + logging.info("Data fetching and saving completed.") diff --git a/scripts/iafd/detail_fetch.py b/scripts/iafd/src_json/performers_details.py similarity index 99% rename from scripts/iafd/detail_fetch.py rename to scripts/iafd/src_json/performers_details.py index 43c6e38..d04aafc 100644 --- a/scripts/iafd/detail_fetch.py +++ b/scripts/iafd/src_json/performers_details.py @@ -41,7 +41,7 @@ import config config.setup_logging() # 结果路径 -res_dir = './result' +res_dir = '../result' res_json_file = f'{res_dir}/detail.json' res_csv_file = f'{res_dir}/detail.csv' input_json_file = f'{res_dir}/merged.json' diff --git a/scripts/iafd/list_fetch_astro.py b/scripts/iafd/src_json/performers_list_astro.py similarity index 99% rename from scripts/iafd/list_fetch_astro.py rename to scripts/iafd/src_json/performers_list_astro.py index 2fe8ec4..eaf3be6 100644 --- a/scripts/iafd/list_fetch_astro.py +++ b/scripts/iafd/src_json/performers_list_astro.py @@ -46,7 +46,7 @@ headers = { scraper = cloudscraper.create_scraper() # 结果路径 -res_dir = './result' +res_dir = '../result' # 记录 ethinc_map astro_map = [] diff --git a/scripts/iafd/list_fetch_birth.py b/scripts/iafd/src_json/performers_list_birth.py similarity index 99% rename from scripts/iafd/list_fetch_birth.py rename to scripts/iafd/src_json/performers_list_birth.py index 1161ea9..f7d63dc 100644 --- a/scripts/iafd/list_fetch_birth.py +++ b/scripts/iafd/src_json/performers_list_birth.py @@ -43,7 +43,7 @@ headers = { scraper = cloudscraper.create_scraper() # 结果路径 -res_dir = './result' +res_dir = '../result' # 存储出生日期的映射 birth_map = [] diff --git a/scripts/iafd/list_fetch_ethnic.py b/scripts/iafd/src_json/performers_list_ethnic.py similarity index 99% rename from scripts/iafd/list_fetch_ethnic.py rename to scripts/iafd/src_json/performers_list_ethnic.py index a0080e4..e904540 100644 --- a/scripts/iafd/list_fetch_ethnic.py +++ b/scripts/iafd/src_json/performers_list_ethnic.py @@ -46,7 +46,7 @@ headers = { scraper = cloudscraper.create_scraper() # 结果路径 -res_dir = './result' +res_dir = '../result' # 记录 ethinc_map ethnic_map = [] diff --git a/scripts/iafd/list_merge.py b/scripts/iafd/src_json/performers_list_merge.py similarity index 92% rename from scripts/iafd/list_merge.py rename to scripts/iafd/src_json/performers_list_merge.py index bdf79a5..083dccc 100644 --- a/scripts/iafd/list_merge.py +++ b/scripts/iafd/src_json/performers_list_merge.py @@ -30,6 +30,9 @@ import os import argparse from collections import defaultdict +# 结果路径 +res_dir = '../result' + # 读取文件并返回内容 def read_json(file_path): try: @@ -90,9 +93,9 @@ def main(): # 定义需要处理的文件 file_map = { - 'birth': 'result/birth.json', - 'astro': 'result/astro.json', - 'ethnic': 'result/ethnic.json' + 'birth': f'{res_dir}/birth.json', + 'astro': f'{res_dir}/astro.json', + 'ethnic': f'{res_dir}/ethnic.json' } files = [{'path': file_map[file], 'name': file} for file in args.files] @@ -101,11 +104,11 @@ def main(): processed_data = process_data(files) # 根据输入的文件名生成 merged 文件名 - output_json_file = f'result/merged_{"_".join(args.files)}.json' - output_csv_file = f'result/merged_{"_".join(args.files)}.csv' + output_json_file = f'{res_dir}/merged_{"_".join(args.files)}.json' + output_csv_file = f'{res_dir}/merged_{"_".join(args.files)}.csv' # 确保 result 目录存在 - os.makedirs('result', exist_ok=True) + os.makedirs(f'{res_dir}', exist_ok=True) # 输出结果到 JSON 和 CSV 文件 save_to_json(processed_data, output_json_file) diff --git a/scripts/iafd/data_merge.py b/scripts/iafd/tools/data_merge.py similarity index 100% rename from scripts/iafd/data_merge.py rename to scripts/iafd/tools/data_merge.py diff --git a/scripts/iafd/iafd_scrape.py b/scripts/iafd/tools/iafd_scrape.py similarity index 100% rename from scripts/iafd/iafd_scrape.py rename to scripts/iafd/tools/iafd_scrape.py diff --git a/scripts/iafd/stashdb_merge.py b/scripts/iafd/tools/stashdb_merge.py similarity index 100% rename from scripts/iafd/stashdb_merge.py rename to scripts/iafd/tools/stashdb_merge.py