modify iafd scripts.

This commit is contained in:
2025-03-03 09:02:59 +08:00
parent fcbd597c45
commit 8fd48687fc
28 changed files with 1015 additions and 4636 deletions

View File

@ -1,192 +0,0 @@
"""
Script Name:
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
从而获取到一份完整的数据列表。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import json
import time
import csv
from bs4 import BeautifulSoup
import logging
import config
config.setup_logging()
# Base URL and variable parameters
host_url = "https://www.iafd.com"
base_url = f"{host_url}/distrib.rme/distrib="
dist_list_url = f'{host_url}/distrib.asp'
distr_map = {
6812 : 'nubilefilms.com',
8563 : 'teenmegaworld network',
6779 : 'x-art.com',
7133 : 'tushy.com',
6496 : 'blacked.com',
7758 : 'vixen.com',
6791 : 'teamskeet.com',
12454: 'vip4k.com',
13541: 'wow network',
9702 : 'cum4k.com',
6778 : 'tiny4k.com',
12667: 'anal4k.com',
7419 : 'exotic4k.com',
13594: 'facials4k.com',
13633: 'mom4k.com',
12335: 'slim4k.com',
16709: 'strippers4k.com',
}
# Set up headers and the cloudscraper session
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Output paths
res_dir = './result'
all_data = []
# Fetch a page and return its HTML (None on failure)
def fetch_page(url):
try:
response = scraper.get(url, headers=headers)
response.raise_for_status()
return response.text
except Exception as e:
logging.error(f"Failed to fetch {url}: {e}")
return None
# Parse the HTML and extract the rows we need
def parse_page(html, name):
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", id="distable")
if not table:
logging.warning(f"Warning: No 'distable' table found in {name}")
return None
    # Find the thead and drop it
    thead = table.find('thead')
    if thead:
        thead.decompose()  # the table header doesn't need parsing
    # Only the tbody remains now
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
global all_data
for row in rows:
cols = row.find_all('td')
if len(cols) >= 5:
title = cols[0].text.strip()
label = cols[1].text.strip()
year = cols[2].text.strip()
rev = cols[3].text.strip()
a_href = cols[0].find('a')
href = host_url + a_href['href'] if a_href else ''
all_data.append({
'distributors': name,
'title': title,
'label': label,
'year': year,
'rev': rev,
'href': href
})
return soup
# Handle pagination (these list pages fit on one page, so there is nothing to do)
def handle_pagination(soup, astro):
return None
# Fetch the list page
def process_list_gage():
global distr_map
logging.info(f"Fetching data for {dist_list_url} ...")
select_element = None
while True:
html = fetch_page(dist_list_url)
if html:
soup = BeautifulSoup(html, "html.parser")
select_element = soup.find('select', {'name': 'Distrib'})
            if select_element:
break
else:
logging.info(f"wrong html content. retring {dist_list_url} ...")
else:
logging.info(f"wrong html content. retring {dist_list_url} ...")
if not select_element:
return None
options = select_element.find_all('option')
for option in options:
        value = option.get('value')  # the option's value attribute
        text = option.text.strip()  # the option's display text
distr_map[int(value)] = text
    logging.info(f'fetched {dist_list_url} successfully. total distributors: {len(distr_map)}')
return True
# Main loop: process each distributor
def process_main_data():
for dis_key, dis_name in distr_map.items():
url = base_url + str(dis_key)
next_url = url
logging.info(f"Fetching data for {dis_name}, url {url} ...")
while next_url:
html = fetch_page(next_url)
if html:
soup = parse_page(html, dis_name)
if soup:
next_url = handle_pagination(soup, dis_name)
else:
logging.info(f"wrong html content. retring {next_url} ...")
                # Save results periodically
                save_data()
                time.sleep(2)  # throttle the request rate
else:
logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying
# Save results to JSON and CSV files
def save_data():
with open(f'{res_dir}/distributors.json', 'w', encoding='utf-8') as json_file:
json.dump(all_data, json_file, indent=4, ensure_ascii=False)
with open(f'{res_dir}/distributors.csv', 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=['distributors', 'title', 'label', 'year', 'rev', 'href'])
writer.writeheader()
writer.writerows(all_data)
# Run the main logic
if __name__ == '__main__':
#process_list_gage()
process_main_data()
save_data()
logging.info("Data fetching and saving completed.")

File diff suppressed because one or more lines are too long (12 files)

View File

@ -1,90 +0,0 @@
"""
Script Name:
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
从而获取到一份完整的数据列表。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import os
import json
import csv
from bs4 import BeautifulSoup
INPUT_DIR = "html"
OUTPUT_JSON = "./result/iafd_meta.json"
OUTPUT_CSV = "./result/iafd_meta.csv"
BASE_URL = "https://www.iafd.com"
def parse_html_file(filepath):
"""解析单个 HTML 文件,提取需要的信息。"""
person_list = []
filename = os.path.basename(filepath)
filename = os.path.splitext(filename)[0]
with open(filepath, "r", encoding="utf-8") as file:
soup = BeautifulSoup(file, "html.parser")
astro_div = soup.find("div", id="astro")
if not astro_div:
print(f"Warning: No 'astro' div found in {filename}")
return []
birth_date = None
for elem in astro_div.find_all(recursive=False):
if elem.name == "h3" and "astroday" in elem.get("class", []):
birth_date = elem.get_text(strip=True)
elif elem.name == "div" and "perficon" in elem.get("class", []):
a_tag = elem.find("a")
if a_tag:
href = BASE_URL + a_tag["href"]
name = a_tag.find("span", class_="perfname")
if name:
person_list.append({
"astrology": filename,
"birth_date": birth_date,
"person": name.get_text(strip=True),
"href": href
})
return person_list
def main():
all_persons = []
for filename in os.listdir(INPUT_DIR):
if filename.endswith(".html"):
filepath = os.path.join(INPUT_DIR, filename)
print(f"正在解析 {filename} ...")
all_persons.extend(parse_html_file(filepath))
    # Save JSON
with open(OUTPUT_JSON, "w", encoding="utf-8") as json_file:
json.dump(all_persons, json_file, indent=4, ensure_ascii=False)
    # Save CSV
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=["astrology", "birth_date", "person", "href"])
writer.writeheader()
writer.writerows(all_persons)
print(f"Data extracted and saved to {OUTPUT_JSON} and {OUTPUT_CSV}")
if __name__ == "__main__":
main()
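
For reference, the selectors in parse_html_file imply a page layout where each astroday heading is followed by the perficon entries it covers. Below is a small sketch that runs the same walk over a hypothetical fragment; only the ids and class names come from the code above, the surrounding markup is an assumption:

from bs4 import BeautifulSoup

html = """
<div id="astro">
  <h3 class="astroday">January 1</h3>
  <div class="perficon">
    <a href="/person.rme/id=0000"><span class="perfname">Jane Doe</span></a>
  </div>
</div>
"""
astro_div = BeautifulSoup(html, "html.parser").find("div", id="astro")
birth_date = None
for elem in astro_div.find_all(recursive=False):
    if elem.name == "h3" and "astroday" in elem.get("class", []):
        birth_date = elem.get_text(strip=True)  # a date heading precedes its performers
    elif elem.name == "div" and "perficon" in elem.get("class", []):
        a_tag = elem.find("a")
        print(birth_date, a_tag.find("span", class_="perfname").get_text(strip=True), a_tag["href"])
# -> January 1 Jane Doe /person.rme/id=0000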

View File

@ -0,0 +1,20 @@
href,title,Minutes,Distributor,Studio,ReleaseDate,AddedtoIAFDDate,All-Girl,All-Male,Compilation,Webscene,Director
https://www.iafd.com/title.rme/id=aefba072-0133-4b0e-8a88-e2d5ea24ff06,About Last Night,No Data,nubilefilms.com,GirlsOnlyPorn.com,"Mar 15, 2022","Mar 15, 2022",Yes,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=d6034e14-b6e0-4a4c-9800-46f00bdde130,Absolute Beauty,No Data,nubilefilms.com,nubilefilms.com,"Sep 28, 2016","Aug 2, 2021",No,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=8014b300-3307-4d36-828e-b29963742a65,Ace In The Sheets,No Data,nubilefilms.com,nubilefilms.com,"Jun 08, 2020","Jun 9, 2020",No,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=62990006-e227-4596-852c-d38b3a0923f8,Addicted to Love,No Data,nubilefilms.com,nubilefilms.com,"Mar 25, 2013","Aug 2, 2021",No,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=cd5a3966-226c-4d72-b7a0-f838104834ec,Adorable (II),No Data,nubilefilms.com,GirlsOnlyPorn.com,"Dec 19, 2012","Aug 2, 2021",Yes,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=9740af6b-1633-481a-8ec0-5d7bade7b404,After Class,No Data,nubilefilms.com,nubilefilms.com,"Jan 10, 2021","Aug 2, 2021",No,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=eeb5b776-5e42-40eb-b13b-34003c519111,After Party (III),No Data,nubilefilms.com,nubilefilms.com,"Jul 30, 2015","Aug 2, 2021",No,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=f4a624b8-43cb-43b6-80ef-1156cd0a1f09,After the Sunset,No Data,nubilefilms.com,GirlsOnlyPorn.com,"Jan 05, 2016","Aug 2, 2021",Yes,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=189309aa-54ea-4081-820f-5783e08db8d7,Afterhours (II),No Data,nubilefilms.com,GirlsOnlyPorn.com,"Feb 27, 2013","Aug 2, 2021",Yes,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=17489ed9-4710-4672-b3fa-fd2e25a70750,Afternoon Delight,No Data,nubilefilms.com,nubilefilms.com,"Apr 13, 2015","Aug 2, 2021",No,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=ec88d89a-b538-432c-9d27-b67f4f6c826f,Afternoon Fantasy,No Data,nubilefilms.com,GirlsOnlyPorn.com,"Aug 31, 2012","Aug 2, 2021",Yes,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=8f398cf1-1009-4997-a264-ef29c96ad608,Afternoon Lounge,No Data,nubilefilms.com,GirlsOnlyPorn.com,"Aug 30, 2012","Aug 2, 2021",Yes,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=31707f4b-a82f-47cb-b714-83f9a3fa4403,Again In The Morning,No Data,nubilefilms.com,nfbusty.com,"Jul 05, 2022","Jul 6, 2022",No,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=e94c5738-fe9b-4905-b300-d22a553ee4ca,Agonizing Release,No Data,nubilefilms.com,GirlsOnlyPorn.com,"Mar 13, 2013","Aug 2, 2021",Yes,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=f8d8379d-c5bb-4b43-ade4-96e937a08115,All About Tonight,36,nubilefilms.com,nubilefilms.com,"Feb 16, 2022","Feb 16, 2022",No,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=73bb5218-1023-49ba-8c94-3b9fb14fee94,All Alone,No Data,nubilefilms.com,GirlsOnlyPorn.com,"Oct 28, 2012","Aug 2, 2021",Yes,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=0e1b42a0-ccd6-4469-967f-96a10072b68e,All Business,No Data,nubilefilms.com,nubilefilms.com,"Dec 06, 2015","Aug 2, 2021",No,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=b3658e59-afa4-45e6-b842-6e26f9b3a867,All For Love,No Data,nubilefilms.com,hotcrazymess.com,"Aug 02, 2019","Aug 2, 2019",No,No,No,Yes,No Data
https://www.iafd.com/title.rme/id=ddf2ef79-543d-4e89-b609-6b9e707dfac4,All For Lust,22,nubilefilms.com,nubilefilms.com,"Jan 03, 2019","Jan 4, 2019",No,No,No,Yes,No Data

View File

@ -0,0 +1,809 @@
[
{
"href": "https://www.iafd.com/title.rme/id=aefba072-0133-4b0e-8a88-e2d5ea24ff06",
"title": "About Last Night",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "GirlsOnlyPorn.com",
"ReleaseDate": "Mar 15, 2022",
"AddedtoIAFDDate": "Mar 15, 2022",
"All-Girl": "Yes",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm",
"Performers": [
{
"name": "Alex Coal",
"href": "https://www.iafd.com/person.rme/id=a731a328-f69d-4ac5-a82f-81e010341895",
"tags": [
"Alex Coal",
"LezOnly Bald"
]
},
{
"name": "Liz Jordan",
"href": "https://www.iafd.com/person.rme/id=d2b9fa26-3062-4b90-a1dd-10f3ef26fb68",
"tags": [
"Liz Jordan",
"LezOnly Bald"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Alex Coal",
"Liz Jordan"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=d6034e14-b6e0-4a4c-9800-46f00bdde130",
"title": "Absolute Beauty",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "nubilefilms.com",
"ReleaseDate": "Sep 28, 2016",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm",
"Performers": [
{
"name": "Aisha",
"href": "https://www.iafd.com/person.rme/id=20d3df80-22d1-4375-a8e5-7b276f25d8d3",
"tags": [
"Aisha"
]
},
{
"name": "Nick Ross",
"href": "https://www.iafd.com/person.rme/id=0a566505-1e62-467a-ba79-3e45b5f540de",
"tags": [
"Nick Ross"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Aisha",
"Nick Ross"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=8014b300-3307-4d36-828e-b29963742a65",
"title": "Ace In The Sheets",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "nubilefilms.com",
"ReleaseDate": "Jun 08, 2020",
"AddedtoIAFDDate": "Jun 9, 2020",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm",
"Performers": [
{
"name": "Codey Steele",
"href": "https://www.iafd.com/person.rme/id=4666d258-5053-401b-bcc6-22737ff15a89",
"tags": [
"Codey Steele"
]
},
{
"name": "Karla Kush",
"href": "https://www.iafd.com/person.rme/id=91e5e13c-e5de-4d0a-83c8-d00a19959ffd",
"tags": [
"Karla Kush"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Karla Kush",
"Codey Steele"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=62990006-e227-4596-852c-d38b3a0923f8",
"title": "Addicted to Love",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "nubilefilms.com",
"ReleaseDate": "Mar 25, 2013",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm",
"Performers": [
{
"name": "Logan Pierce",
"href": "https://www.iafd.com/person.rme/id=ec02e34e-1f30-489d-934b-e1fb22531a6e",
"tags": [
"Logan Pierce"
]
},
{
"name": "Presley Hart",
"href": "https://www.iafd.com/person.rme/id=8497863b-5a34-4594-bd4d-8188ac909a05",
"tags": [
"Presley Hart"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Presley Hart",
"Logan Pierce"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=cd5a3966-226c-4d72-b7a0-f838104834ec",
"title": "Adorable (II)",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "GirlsOnlyPorn.com",
"ReleaseDate": "Dec 19, 2012",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "Yes",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm",
"Performers": [
{
"name": "Beata Undine",
"href": "https://www.iafd.com/person.rme/id=46d59a10-988e-453b-9ed3-aafa1a0a3b5a",
"tags": [
"Beata Undine",
"MastOnly"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Beata Undine"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=9740af6b-1633-481a-8ec0-5d7bade7b404",
"title": "After Class",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "nubilefilms.com",
"ReleaseDate": "Jan 10, 2021",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm",
"Performers": [
{
"name": "Alex Coal",
"href": "https://www.iafd.com/person.rme/id=a731a328-f69d-4ac5-a82f-81e010341895",
"tags": [
"Alex Coal",
"Creampie"
]
},
{
"name": "Stirling Cooper",
"href": "https://www.iafd.com/person.rme/id=a983951b-87d5-4dd9-a5c7-597e0d9e59f4",
"tags": [
"Stirling Cooper"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Alex Coal",
"Stirling Cooper"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=eeb5b776-5e42-40eb-b13b-34003c519111",
"title": "After Party (III)",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "nubilefilms.com",
"ReleaseDate": "Jul 30, 2015",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm",
"Performers": [
{
"name": "Dakota Skye",
"href": "https://www.iafd.com/person.rme/id=24d4ce7c-777f-4303-b122-9167b15100a8",
"tags": [
"Dakota Skye"
]
},
{
"name": "Samantha Hayes",
"href": "https://www.iafd.com/person.rme/id=9fc8d827-52bc-43d2-8eee-d57a7274745d",
"tags": [
"Samantha Hayes"
]
},
{
"name": "Van Wylde",
"href": "https://www.iafd.com/person.rme/id=61980288-9c48-4875-b319-0cef87f26ee6",
"tags": [
"Van Wylde"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Dakota Skye",
"Samantha Hayes",
"Van Wylde"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=f4a624b8-43cb-43b6-80ef-1156cd0a1f09",
"title": "After the Sunset",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "GirlsOnlyPorn.com",
"ReleaseDate": "Jan 05, 2016",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "Yes",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm",
"Performers": [
{
"name": "Cadence Lux",
"href": "https://www.iafd.com/person.rme/id=d54dc6db-7800-4f98-a2f1-ca37682a8810",
"tags": [
"Cadence Lux",
"LezOnly"
]
},
{
"name": "Xandra Sixx",
"href": "https://www.iafd.com/person.rme/id=da1d6ba3-13cd-4dd8-83e3-49a027c3c1fc",
"tags": [
"Xandra Sixx",
"LezOnly"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Cadence Lux",
"Xandra Sixx"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=189309aa-54ea-4081-820f-5783e08db8d7",
"title": "Afterhours (II)",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "GirlsOnlyPorn.com",
"ReleaseDate": "Feb 27, 2013",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "Yes",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm",
"Performers": [
{
"name": "Aiden Ashley",
"href": "https://www.iafd.com/person.rme/id=4cf532c9-e713-4a93-b455-37c55a7598e3",
"tags": [
"Aiden Ashley",
"MastOnly"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Aiden Ashley"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=17489ed9-4710-4672-b3fa-fd2e25a70750",
"title": "Afternoon Delight",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "nubilefilms.com",
"ReleaseDate": "Apr 13, 2015",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm",
"Performers": [
{
"name": "Denis Reed",
"href": "https://www.iafd.com/person.rme/id=c4eb5784-e962-495e-bcfd-0a4b0370b3a4",
"tags": [
"Denis Reed"
]
},
{
"name": "Olivia Devine",
"href": "https://www.iafd.com/person.rme/id=44a8ec27-3d38-43e3-a505-89832b8f4a7f",
"tags": [
"Olivia Devine"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Olivia Devine",
"Denis Reed"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=ec88d89a-b538-432c-9d27-b67f4f6c826f",
"title": "Afternoon Fantasy",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "GirlsOnlyPorn.com",
"ReleaseDate": "Aug 31, 2012",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "Yes",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm",
"Performers": [
{
"name": "Ally Summers",
"href": "https://www.iafd.com/person.rme/id=9ae9dbe6-40be-4347-a4f4-7c9c78a4cf72",
"tags": [
"Ally Summers",
"LezOnly"
]
},
{
"name": "Kiki Kandy",
"href": "https://www.iafd.com/person.rme/id=f3fe5d28-4d99-4157-8f96-701049574e18",
"tags": [
"Kiki Kandy",
"LezOnly"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Ally Summers",
"Kiki Kandy"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=8f398cf1-1009-4997-a264-ef29c96ad608",
"title": "Afternoon Lounge",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "GirlsOnlyPorn.com",
"ReleaseDate": "Aug 30, 2012",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "Yes",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm",
"Performers": [
{
"name": "Alexis Venton",
"href": "https://www.iafd.com/person.rme/id=7a6f6bb6-4e64-41d7-8cbe-5613acbf9bdf",
"tags": [
"Alexis Venton",
"MastOnly"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Alexis Venton"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=31707f4b-a82f-47cb-b714-83f9a3fa4403",
"title": "Again In The Morning",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "nfbusty.com",
"ReleaseDate": "Jul 05, 2022",
"AddedtoIAFDDate": "Jul 6, 2022",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=8673/nfbusty%2ecom.htm",
"Performers": [
{
"name": "Jay Romero",
"href": "https://www.iafd.com/person.rme/id=35ed8398-192b-44f2-aa90-674b81df9a5d",
"tags": [
"Jay Romero"
]
},
{
"name": "Octavia Red",
"href": "https://www.iafd.com/person.rme/id=071bfa48-1e70-4dfb-bea3-6c591ccc0f08",
"tags": [
"Octavia Red"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Octavia Red",
"Jay Romero"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=e94c5738-fe9b-4905-b300-d22a553ee4ca",
"title": "Agonizing Release",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "GirlsOnlyPorn.com",
"ReleaseDate": "Mar 13, 2013",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "Yes",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm",
"Performers": [
{
"name": "Maddy O'Reilly",
"href": "https://www.iafd.com/person.rme/id=55c1362a-b07e-4015-8b09-d96b611427f7",
"tags": [
"Maddy O'Reilly",
"MastOnly"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Maddy O'Reilly"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=f8d8379d-c5bb-4b43-ade4-96e937a08115",
"title": "All About Tonight",
"Minutes": "36",
"Distributor": "nubilefilms.com",
"Studio": "nubilefilms.com",
"ReleaseDate": "Feb 16, 2022",
"AddedtoIAFDDate": "Feb 16, 2022",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm",
"Performers": [
{
"name": "Kristof Cale",
"href": "https://www.iafd.com/person.rme/id=8fe7d706-53aa-4810-992e-cfb207f15ec5",
"tags": [
"Kristof Cale"
]
},
{
"name": "Marilyn Sugar",
"href": "https://www.iafd.com/person.rme/id=38c9f144-801a-4c82-abbc-d610e5b7648b",
"tags": [
"Marilyn Sugar",
"Bald Creampie"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Marilyn Sugar",
"Kristof Cale"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=73bb5218-1023-49ba-8c94-3b9fb14fee94",
"title": "All Alone",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "GirlsOnlyPorn.com",
"ReleaseDate": "Oct 28, 2012",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "Yes",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12666/girlsonlyporn%2ecom.htm",
"Performers": [
{
"name": "Marry Queen",
"href": "https://www.iafd.com/person.rme/id=6410fb45-95e4-4736-b241-be699f16ba93",
"tags": [
"Marry Queen",
"MastOnly"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Marry Queen"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=0e1b42a0-ccd6-4469-967f-96a10072b68e",
"title": "All Business",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "nubilefilms.com",
"ReleaseDate": "Dec 06, 2015",
"AddedtoIAFDDate": "Aug 2, 2021",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm",
"Performers": [
{
"name": "Chloe Amour",
"href": "https://www.iafd.com/person.rme/id=ea8a5be1-1831-4afa-9016-c1a0b8da9f56",
"tags": [
"Chloe Amour"
]
},
{
"name": "Kinsley Eden",
"href": "https://www.iafd.com/person.rme/id=f8d4d70c-2fe8-4242-b2dd-d7508bb546a0",
"tags": [
"Kinsley Eden"
]
},
{
"name": "Ryan Driller",
"href": "https://www.iafd.com/person.rme/id=03462f35-fc7d-48ed-821e-c128b5a330a9",
"tags": [
"Ryan Driller"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Chloe Amour",
"Kinsley Eden",
"Ryan Driller"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=b3658e59-afa4-45e6-b842-6e26f9b3a867",
"title": "All For Love",
"Minutes": "No Data",
"Distributor": "nubilefilms.com",
"Studio": "hotcrazymess.com",
"ReleaseDate": "Aug 02, 2019",
"AddedtoIAFDDate": "Aug 2, 2019",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=10229/hotcrazymess%2ecom.htm",
"Performers": [
{
"name": "Amber Addis",
"href": "https://www.iafd.com/person.rme/id=d555438f-422a-4d0b-9a6c-40d7055be2ff",
"tags": [
"Amber Addis",
"Facial"
]
},
{
"name": "Dick Swardson",
"href": "https://www.iafd.com/person.rme/id=4353efa9-d7e5-44f3-a4c3-35d3be3d70c8",
"tags": [
"Dick Swardson"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Amber Addis",
"Dick Swardson"
]
}
],
"AppearsIn": []
},
{
"href": "https://www.iafd.com/title.rme/id=ddf2ef79-543d-4e89-b609-6b9e707dfac4",
"title": "All For Lust",
"Minutes": "22",
"Distributor": "nubilefilms.com",
"Studio": "nubilefilms.com",
"ReleaseDate": "Jan 03, 2019",
"AddedtoIAFDDate": "Jan 4, 2019",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "Yes",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=6812/nubilefilms%2ecom.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=6812/nubilefilms%2ecom.htm",
"Performers": [
{
"name": "Cindy Shine",
"href": "https://www.iafd.com/person.rme/id=d0cbd002-4dfe-419b-90fc-74df1a527e30",
"tags": [
"Cindy Shine"
]
},
{
"name": "Daniel G.",
"href": "https://www.iafd.com/person.rme/id=8d84a7ac-56f9-4cf7-84c6-ad2ef5efac1f",
"tags": [
"Daniel G."
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Cindy Shine",
"Daniel G."
]
}
],
"AppearsIn": []
}
]

View File

@ -0,0 +1 @@
distributors.json

View File

@ -3,13 +3,8 @@ import os
import inspect
from datetime import datetime
# MySQL configuration
db_config = {
'host': '172.18.0.3',
'user': 'root',
'password': 'mysqlpw',
'database': 'stockdb'
}
global_share_data_dir = '/root/sharedata'
global_host_data_dir = '/root/hostdir/scripts_data'
# Logging setup
def setup_logging(log_filename=None):
@ -22,7 +17,7 @@ def setup_logging(log_filename=None):
    # Current date, formatted as yyyymmdd
    current_date = datetime.now().strftime('%Y%m%d')
    # Build the log filename, inserting the date before the extension
log_filename = f'./log/{caller_filename}_{current_date}.log'
log_filename = f'../log/{caller_filename}_{current_date}.log'
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
handlers=[
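
For context, setup_logging builds the default log name from the calling script's filename plus the current date, so the change above only moves the log directory up one level. A minimal sketch of the naming scheme (the inspect-based caller lookup is an assumption, since the hunk truncates that part):

import os
import inspect
from datetime import datetime

def default_log_filename():
    # assumed: the caller's file is taken from the stack, one frame up
    caller_file = inspect.stack()[1].filename
    caller_filename = os.path.splitext(os.path.basename(caller_file))[0]
    current_date = datetime.now().strftime('%Y%m%d')  # yyyymmdd
    return f'../log/{caller_filename}_{current_date}.log'

# e.g. called from detail_fetch.py on 2025-03-03 -> ../log/detail_fetch_20250303.log
print(default_log_filename())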

View File

@ -16,15 +16,18 @@ config.setup_logging()
host_url = "https://www.iafd.com"
# Directories and file paths
RESULT_DIR = "result"
RESULT_DIR = "../result"
OUTPUT_DIR = f"{config.global_share_data_dir}/iafd"
INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
OUTPUT_JSON = os.path.join(RESULT_DIR, "movie_details.json")
OUTPUT_CSV = os.path.join(RESULT_DIR, "movie_details.csv")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, "movie_details.json")
OUTPUT_CSV = os.path.join(OUTPUT_DIR, "movie_details.csv")
BATCH_SIZE = 100  # write to the output files every 100 records
# Initialize the Cloudflare bypass scraper
scraper = cloudscraper.create_scraper()
# All data accumulated so far
all_movies = []
def load_existing_data():
"""加载已处理的数据,支持续传"""
@ -37,9 +40,10 @@ def load_existing_data():
return []
def save_data(all_movies):
def save_data():
"""保存数据到 JSON 和 CSV 文件"""
logging.info("Saving data...")
global all_movies
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
json.dump(all_movies, f, indent=4, ensure_ascii=False)
@ -53,7 +57,7 @@ def save_data(all_movies):
movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"], movie["All-Girl"],
movie["All-Male"], movie["Compilation"], movie["Webscene"], movie["Director"]])
# Request the page and return its HTML
def fetch_html(href):
    """Request the page and return its HTML."""
for attempt in range(3):
@ -68,14 +72,13 @@ def fetch_html(href):
logging.error(f"Failed to fetch {href} after 3 attempts")
return None
# Parse the page HTML and extract the movie info
def parse_movie_details(html, href, title):
    """Parse the page HTML and extract the movie info."""
    soup = BeautifulSoup(html, "html.parser")
    # Parse the basic movie fields
    movie_data = {}
    director_href = ''
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
if info_div:
labels = info_div.find_all("p", class_="bioheading")
@ -87,8 +90,7 @@ def parse_movie_details(html, href, title):
link = value.find("a")
if link:
val = link.text.strip()
if key == 'Director':
director_href = host_url + link['href']
movie_data[f'{key}Href'] = host_url + link['href']
movie_data[key] = val
else:
return None
@ -116,12 +118,53 @@ def parse_movie_details(html, href, title):
scene_table = soup.find("div", id="sceneinfo")
if scene_table:
rows = scene_table.find_all("tr")
for row in rows:
cols = row.find_all("td")
if len(cols) >= 2:
scene = cols[0].text.strip()
scene_performers = [p.strip() for p in cols[1].text.split(",")]
scene_breakdowns.append({"scene": scene, "performers": scene_performers})
                scene = cols[0].text.strip()  # scene number
                performer_info = cols[1]  # performers plus link info
                # Take the HTML before the first <br> (preserving <i> tags and other inline markup).
                # str.split always returns a non-empty list, so chain both break-tag spellings
                # rather than branching on a result that can never be empty.
                performer_html = str(performer_info)
                performers_html = performer_html.split("<br/>")[0].split("<br>")[0].strip()
                # Reduce to plain text: strip the HTML tags, keeping only the text content
                performers_soup = BeautifulSoup(performers_html, "html.parser")
                performers_text = performers_soup.get_text()
                # Extract the performers
                scene_performers = [p.strip() for p in performers_text.split(",")]
                # Try to pick up the `webscene` and `studio` links
links_data = {}
links = performer_info.find_all("a")
if links:
                    webscene_title = links[0].text.strip()
                    webscene = links[0]["href"]
                    studio = links[1].text.strip() if len(links) > 1 else None
                    studio_lnk = links[1]["href"] if len(links) > 1 else None
links_data = {
"title": webscene_title,
"webscene": webscene,
"studio": studio,
"studio_lnk": studio_lnk,
}
scene_data = {
"scene": scene,
"performers": scene_performers,
**links_data,
}
scene_breakdowns.append(scene_data)
appears_in = []
appears_divs = soup.find("div", id="appearssection")
@ -146,7 +189,9 @@ def parse_movie_details(html, href, title):
"Compilation": movie_data.get("Compilation", ""),
"Webscene": movie_data.get("Webscene", ""),
"Director": movie_data.get("Director", ""),
"DirectorHref": director_href,
"DirectorHref": movie_data.get("DirectorHref", ""),
"DistributorHref": movie_data.get("DistributorHref", ""),
"StudioHref": movie_data.get("StudioHref", ""),
"Performers": performers,
"SceneBreakdowns": scene_breakdowns,
"AppearsIn": appears_in,
@ -155,6 +200,7 @@ def parse_movie_details(html, href, title):
def process_movies():
"""处理电影数据"""
global all_movies
all_movies = load_existing_data()
processed_hrefs = {movie["href"] for movie in all_movies}
@ -162,7 +208,6 @@ def process_movies():
with open(INPUT_FILE, "r", encoding="utf-8") as f:
movies = json.load(f)
new_movies = []
count = 0
for entry in movies:
@ -170,25 +215,32 @@ def process_movies():
title = entry["title"]
if href in processed_hrefs:
logging.info(f"Skiping existed: {title} ({href})")
continue # 跳过已处理数据
logging.info(f"Processing: {title} ({href})")
html = fetch_html(href)
if not html:
            continue  # fetch failed, skip
movie = parse_movie_details(html, href, title)
new_movies.append(movie)
count += 1
while True:
html = fetch_html(href)
if not html:
                logging.warning(f'Retrying {title} ({href})')
                continue  # fetch failed, try again
else:
movie = parse_movie_details(html, href, title)
if not movie:
                    logging.warning(f'Retrying {title} ({href})')
continue
else:
all_movies.append(movie)
count += 1
break
        # Flush to the output files every BATCH_SIZE records
if count % BATCH_SIZE == 0:
save_data(all_movies + new_movies)
save_data()
    # Final save
all_movies.extend(new_movies)
save_data(all_movies)
save_data()
logging.info("Task completed.")
@ -219,7 +271,7 @@ def process_one(href):
            continue  # fetch failed, skip
id = extract_id_from_href(href)
filename = f"{id}.json" # 用 - 替换空格
filename = f"./log/{id}.json" # 用 - 替换空格
try:
with open(filename, 'w', encoding='utf-8') as json_file:
@ -228,7 +280,7 @@ def process_one(href):
logging.error(f"Error writing file {filename}: {e}")
    print(f'fetch succeeded. saved result in {filename}')
# Save pending data when the process is terminated
def handle_exit_signal(signal, frame):
logging.info("Gracefully exiting... Saving remaining data to Json and CSV.")
save_data()
@ -246,6 +298,7 @@ def main():
save_data()
logging.info("Data processing completed.")
# Entry point: read the command-line argument
if __name__ == "__main__":
if len(sys.argv) > 1:
url = sys.argv[1]
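
To make the reworked scene-breakdown parsing in the hunk above concrete, here is a self-contained sketch of what it extracts from one sceneinfo row. The HTML fragment is a hypothetical shape modeled on the selectors in the code, not captured from iafd.com:

from bs4 import BeautifulSoup

row_html = (
    '<tr><td>Scene 1</td>'
    '<td>Alex Coal, Liz Jordan<br/>'
    '<a href="/title.rme/id=x">About Last Night</a> '
    '<a href="/studio.rme/studio=12666">GirlsOnlyPorn.com</a></td></tr>'
)
cols = BeautifulSoup(row_html, "html.parser").find_all("td")
scene = cols[0].text.strip()
performer_info = cols[1]
# Keep only the markup before the first <br>, then strip the tags.
performers_html = str(performer_info).split("<br/>")[0].split("<br>")[0].strip()
performers = [p.strip() for p in
              BeautifulSoup(performers_html, "html.parser").get_text().split(",")]
links = performer_info.find_all("a")
print(scene, performers)  # Scene 1 ['Alex Coal', 'Liz Jordan']
print(links[0].text.strip(), links[1].text.strip())  # About Last Night GirlsOnlyPorn.com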

View File

@ -28,6 +28,7 @@ import cloudscraper
import json
import time
import csv
import argparse
from bs4 import BeautifulSoup
import logging
import config
@ -36,9 +37,50 @@ config.setup_logging()
# Base URL and variable parameters
host_url = "https://www.iafd.com"
base_url = f"{host_url}/studio.rme/studio="
list_page_url = f'{host_url}/studio.asp'
# Output paths
res_dir = f"{config.global_share_data_dir}/iafd"
fetch_config = {
'dist': {
'base_url': f"{host_url}/distrib.rme/distrib=",
'list_page_url': f"{host_url}/distrib.asp",
'html_table_id': 'distable',
'html_select_name': 'Distrib',
'output_key_id': 'distributors',
'json_file': f'{res_dir}/distributors.json',
'csv_file': f'{res_dir}/distributors.csv',
},
'stu': {
'base_url': f"{host_url}/studio.rme/studio=",
'list_page_url': f"{host_url}/studio.asp",
'html_table_id': 'studio',
'html_select_name': 'Studio',
'output_key_id': 'studios',
'json_file': f'{res_dir}/studios.json',
'csv_file': f'{res_dir}/studios.csv',
}
}
distr_map = {
6812 : 'nubilefilms.com',
8563 : 'teenmegaworld network',
6779 : 'x-art.com',
7133 : 'tushy.com',
6496 : 'blacked.com',
7758 : 'vixen.com',
6791 : 'teamskeet.com',
12454: 'vip4k.com',
13541: 'wow network',
9702 : 'cum4k.com',
6778 : 'tiny4k.com',
12667: 'anal4k.com',
7419 : 'exotic4k.com',
13594: 'facials4k.com',
13633: 'mom4k.com',
12335: 'slim4k.com',
16709: 'strippers4k.com',
}
studio_map = {
6812 : 'nubilefilms.com',
9811 : 'Teen Mega World',
@ -65,8 +107,6 @@ headers = {
}
scraper = cloudscraper.create_scraper()
# Output paths
res_dir = './result'
all_data = []
# Fetch a page and return its HTML
@ -80,12 +120,15 @@ def fetch_page(url):
return None
# Parse the HTML and extract the rows we need
def parse_page(html, name):
def parse_page(html, name, config):
table_id = config['html_table_id']
key_id = config['output_key_id']
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", id="studio")
table = soup.find("table", id=table_id)
if not table:
logging.warning(f"Warning: No 'studio' table found in {name}")
logging.warning(f"Warning: No {table_id} table found in {name}")
return None
    # Find the thead and drop it
@ -109,7 +152,7 @@ def parse_page(html, name):
href = host_url + a_href['href'] if a_href else ''
all_data.append({
'studios': name,
key_id: name,
'title': title,
'label': label,
'year': year,
@ -123,8 +166,10 @@ def handle_pagination(soup, astro):
return None
# Fetch the list page
def process_list_gage():
global studio_map
def process_list_gage(config):
list_page_url=config['list_page_url']
select_name = config['html_select_name']
list_map = {}
logging.info(f"Fetching data for {list_page_url} ...")
select_element = None
@ -132,7 +177,7 @@ def process_list_gage():
html = fetch_page(list_page_url)
if html:
soup = BeautifulSoup(html, "html.parser")
select_element = soup.find('select', {'name': 'Studio'})
select_element = soup.find('select', {'name': select_name})
            if select_element:
break
else:
@ -147,13 +192,15 @@ def process_list_gage():
for option in options:
        value = option.get('value')  # the option's value attribute
        text = option.text.strip()  # the option's display text
studio_map[int(value)] = text
logging.info(f'fetch {list_page_url} succ. total distributors: {len(studio_map)}')
return True
list_map[int(value)] = text
    logging.info(f'fetched {list_page_url} successfully. total entries: {len(list_map)}')
return list_map
# Main loop: process each entry in the fetched list
def process_main_data():
for key, name in studio_map.items():
def process_main_data(list_data, config):
base_url = config['base_url']
for key, name in list_data.items():
url = base_url + str(key)
next_url = url
logging.info(f"Fetching data for {name}, url {url} ...")
@ -161,31 +208,48 @@ def process_main_data():
while next_url:
html = fetch_page(next_url)
if html:
soup = parse_page(html, name)
soup = parse_page(html, name, config)
if soup:
next_url = handle_pagination(soup, name)
else:
logging.info(f"wrong html content. retring {next_url} ...")
# 定期保存结果
save_data()
save_data(config)
time.sleep(2) # 控制访问频率
else:
logging.info(f"Retrying {next_url} ...")
time.sleep(5) # 等待后再重试
# 保存到文件
def save_data():
with open(f'{res_dir}/studios.json', 'w', encoding='utf-8') as json_file:
def save_data(config):
with open(config['json_file'], 'w', encoding='utf-8') as json_file:
json.dump(all_data, json_file, indent=4, ensure_ascii=False)
with open(f'{res_dir}/studios.csv', 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=['studios', 'title', 'label', 'year', 'rev', 'href'])
with open(config['csv_file'], 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=[config['output_key_id'], 'title', 'label', 'year', 'rev', 'href'])
writer.writeheader()
writer.writerows(all_data)
# Run the main logic
if __name__ == '__main__':
#process_list_gage()
process_main_data()
save_data()
logging.info("Data fetching and saving completed.")
# Command-line argument handling
parser = argparse.ArgumentParser(description='fetch movie list from iafd.com')
parser.add_argument('--type', type=str, default='dist', help='fetch by distributor or studio (dist, stu)')
parser.add_argument('--kind', type=str, default='parts', help='fetch everything or only the built-in subset (all, parts)')
args = parser.parse_args()
config = fetch_config.get(args.type)
if not config:
    logging.warning(f'unknown type: {args.type} {args.kind}')
else:
list_data = {}
if args.kind == 'all':
list_data = process_list_gage(config)
elif args.type == 'dist':
list_data = distr_map
else:
list_data = studio_map
process_main_data(list_data, config)
logging.info("Data fetching and saving completed.")

View File

@ -41,7 +41,7 @@ import config
config.setup_logging()
# Output paths
res_dir = './result'
res_dir = '../result'
res_json_file = f'{res_dir}/detail.json'
res_csv_file = f'{res_dir}/detail.csv'
input_json_file = f'{res_dir}/merged.json'

View File

@ -46,7 +46,7 @@ headers = {
scraper = cloudscraper.create_scraper()
# Output paths
res_dir = './result'
res_dir = '../result'
# Holds the astro_map
astro_map = []

View File

@ -43,7 +43,7 @@ headers = {
scraper = cloudscraper.create_scraper()
# Output paths
res_dir = './result'
res_dir = '../result'
# Map of birth dates
birth_map = []

View File

@ -46,7 +46,7 @@ headers = {
scraper = cloudscraper.create_scraper()
# Output paths
res_dir = './result'
res_dir = '../result'
# Holds the ethnic_map
ethnic_map = []

View File

@ -30,6 +30,9 @@ import os
import argparse
from collections import defaultdict
# Output paths
res_dir = '../result'
# Read a file and return its contents
def read_json(file_path):
try:
@ -90,9 +93,9 @@ def main():
    # Files to process
file_map = {
'birth': 'result/birth.json',
'astro': 'result/astro.json',
'ethnic': 'result/ethnic.json'
'birth': f'{res_dir}/birth.json',
'astro': f'{res_dir}/astro.json',
'ethnic': f'{res_dir}/ethnic.json'
}
files = [{'path': file_map[file], 'name': file} for file in args.files]
@ -101,11 +104,11 @@ def main():
processed_data = process_data(files)
    # Build the merged output filenames from the input file names
output_json_file = f'result/merged_{"_".join(args.files)}.json'
output_csv_file = f'result/merged_{"_".join(args.files)}.csv'
output_json_file = f'{res_dir}/merged_{"_".join(args.files)}.json'
output_csv_file = f'{res_dir}/merged_{"_".join(args.files)}.csv'
    # Make sure the result directory exists
os.makedirs('result', exist_ok=True)
os.makedirs(f'{res_dir}', exist_ok=True)
    # Write the results to JSON and CSV files
save_to_json(processed_data, output_json_file)