modify some scripts.

This commit is contained in:
2025-03-03 19:01:41 +08:00
parent 8fd48687fc
commit f1e5abd6b3
10 changed files with 1642 additions and 0 deletions

View File

@@ -12,6 +12,7 @@ scripts/iafd/data/tmp/
scripts/iafd/result/tmp/
scripts/iafd/result/bak/
scripts/iafd/result/performers/
scripts/iafd/result/movies/
scripts/iafd/log/
scripts/thelordofporn/log/
scripts/vixen_group/log/

View File

@@ -0,0 +1,20 @@
{
"href": "https://www.iafd.com/title.rme/id=0b332f5e-c0b8-4536-a79c-5abd4da9c68a",
"title": "Barebackin' Men",
"Minutes": "No Data",
"Distributor": "1 Distribution",
"Studio": "1 Distribution",
"ReleaseDate": "No Data",
"AddedtoIAFDDate": "Jan 1, 2006",
"All-Girl": "No",
"All-Male": "Yes",
"Compilation": "No",
"Webscene": "",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=10/1%2ddistribution.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=10/1%2ddistribution.htm",
"Performers": [],
"SceneBreakdowns": [],
"AppearsIn": []
}

View File

@@ -0,0 +1,56 @@
{
"href": "https://www.iafd.com/title.rme/id=2f582dcf-192e-4adf-9d60-447df8f16b9c",
"title": "Slim Goodies POV 2",
"Minutes": "84",
"Distributor": "Exotic Vixen Films",
"Studio": "Exotic Vixen Films",
"ReleaseDate": "No Data",
"AddedtoIAFDDate": "Jan 17, 2024",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "",
"Director": "Just Mike Starks",
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
"Performers": [
{
"name": "Amica Mea",
"href": "https://www.iafd.com/person.rme/id=933ab6e6-ba98-49a7-a9e2-c1a4523ab89c",
"tags": [
"Amica Mea"
]
},
{
"name": "Baby Breezy",
"href": "https://www.iafd.com/person.rme/id=b0d3673c-20b4-41eb-bb15-622b2f8e5ed3",
"tags": [
"Baby Breezy"
]
},
{
"name": "Blu Mere",
"href": "https://www.iafd.com/person.rme/id=8d41dd04-f188-44c3-ac40-dc64711cf905",
"tags": [
"Blu Mere"
]
},
{
"name": "Just Mike Starks",
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"tags": [
"Just Mike Starks"
]
},
{
"name": "Mocha Menage",
"href": "https://www.iafd.com/person.rme/id=63b084e9-a144-41da-acf9-a913b68416cd",
"tags": [
"Mocha Menage"
]
}
],
"SceneBreakdowns": [],
"AppearsIn": []
}

View File

@@ -0,0 +1,70 @@
{
"href": "https://www.iafd.com/title.rme/id=9af4e9f4-68ce-47ec-a7d7-fde92862af57",
"title": "Atlanta U: College Freaks",
"Minutes": "No Data",
"Distributor": "Exotic Vixen Films",
"Studio": "Exotic Vixen Films",
"ReleaseDate": "No Data",
"AddedtoIAFDDate": "Sep 19, 2020",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "",
"Director": "Just Mike Starks",
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
"Performers": [
{
"name": "Aaliyah Ali",
"href": "https://www.iafd.com/person.rme/id=7904b6a0-965c-4137-92e2-2ca0cdc4ff38",
"tags": [
"Aaliyah Ali"
]
},
{
"name": "Bones Montana",
"href": "https://www.iafd.com/person.rme/id=8c18f9ad-045d-475f-bee8-cab6bed1fad4",
"tags": [
"Bones Montana"
]
},
{
"name": "Cameron Cox",
"href": "https://www.iafd.com/person.rme/id=ce96cf3d-e85e-4d46-ad5e-f05881c88a26",
"tags": [
"Cameron Cox"
]
},
{
"name": "Crystal Cooper",
"href": "https://www.iafd.com/person.rme/id=fc02ac73-2892-4578-9bf4-eed87afc4980",
"tags": [
"Crystal Cooper"
]
},
{
"name": "Jazmine Adore",
"href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
"tags": [
"Jazmine Adore"
]
},
{
"name": "Just Mike Starks",
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"tags": [
"Just Mike Starks"
]
},
{
"name": "Lala Ivey",
"href": "https://www.iafd.com/person.rme/id=7ae838ee-764f-4725-b422-b41ffac1e67b",
"tags": [
"Lala Ivey"
]
}
],
"SceneBreakdowns": [],
"AppearsIn": []
}

View File

@@ -0,0 +1,85 @@
{
"href": "https://www.iafd.com/title.rme/id=ca753243-8e3a-49ac-88aa-357055187e8c",
"title": "Slim Goodies POV",
"Minutes": "61",
"Distributor": "Exotic Vixen Films",
"Studio": "Exotic Vixen Films",
"ReleaseDate": "No Data",
"AddedtoIAFDDate": "Sep 19, 2020",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "",
"Director": "Just Mike Starks",
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
"Performers": [
{
"name": "Gina Ferrero",
"href": "https://www.iafd.com/person.rme/id=54d62a5b-2ce5-40fa-b050-14325a62f4fd",
"tags": [
"Gina Ferrero"
]
},
{
"name": "Imani Reign",
"href": "https://www.iafd.com/person.rme/id=8566216f-bcbc-4764-ba50-73405cf79dce",
"tags": [
"Imani Reign"
]
},
{
"name": "Jazmine Adore",
"href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
"tags": [
"Jazmine Adore"
]
},
{
"name": "Just Mike Starks",
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"tags": [
"Just Mike Starks"
]
},
{
"name": "Niomie King",
"href": "https://www.iafd.com/person.rme/id=dbefea32-7f88-41a9-9de3-e467015d687a",
"tags": [
"Niomie King"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Imani Reign",
"Just Mike Starks"
]
},
{
"scene": "Scene 2",
"performers": [
"Jazmine Adore",
"Just Mike Starks"
]
},
{
"scene": "Scene 3",
"performers": [
"Gina Ferrero",
"Just Mike Starks"
]
},
{
"scene": "Scene 4",
"performers": [
"Niomie King",
"Just Mike Starks"
]
}
],
"AppearsIn": []
}

View File

@@ -0,0 +1,26 @@
import logging
import os
import inspect
from datetime import datetime
global_share_data_dir = '/root/sharedata'
global_host_data_dir = '/root/hostdir/scripts_data'
# Configure logging
def setup_logging(log_filename=None):
# If no log_filename is passed in, derive it from the calling script's name
if log_filename is None:
# Get the file name of the script that called setup_logging
caller_frame = inspect.stack()[1]
caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
# Current date in yyyymmdd format
current_date = datetime.now().strftime('%Y%m%d')
# Build the log file name, inserting the date before the extension
log_filename = f'../log/{caller_filename}_{current_date}.log'
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
handlers=[
logging.FileHandler(log_filename),
logging.StreamHandler()
])
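A minimal usage sketch of this helper (not part of the commit): fetch.py below calls config.setup_logging() with no argument, so a script named fetch.py run on 2025-03-03 would log to ../log/fetch_20250303.log, assuming the ../log/ directory exists relative to the working directory.

import logging
import config

config.setup_logging()  # or pass an explicit path: config.setup_logging('../log/custom.log')
logging.info("logging initialised")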

320
scripts/iafd/src/fetch.py Normal file
View File

@@ -0,0 +1,320 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as utils
import iafd_scraper as scraper
import utils as func
config.setup_logging()
debug = True
# Fetch the performer list by zodiac sign (no pagination)
def fetch_performers_by_astro(existed_performer_hrefs):
performers = []
for astro in scraper.astro_list:
url = scraper.astr_base_url + astro
logging.info(f"Fetching data for {astro}, url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_astro(soup, astro)
if list_data:
for row in list_data :
if row['href'] not in existed_performer_hrefs:
performers.append({
'person' : row['person'],
'href' : row['href']
})
else:
logging.warning(f'fetch astro error. {url} ...')
else:
logging.warning(f'fetch astro error. {url} ...')
# break early while debugging
if debug:
break
return performers
# Fetch the performer list by birthday (no pagination)
def fetch_performers_by_birth(existed_performer_hrefs):
performers = []
for month in range(1, 13): # months 1 to 12
for day in range(1, 32): # days 1 to 31
url = scraper.birth_base_url.format(month=month, day=day)
logging.info(f"Fetching data for birth, url {url}")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
if soup:
list_data, next_url = scraper.parse_page_birth(soup, month, day)
if list_data:
for row in list_data :
if row['href'] not in existed_performer_hrefs:
performers.append({
'person' : row['person'],
'href' : row['href']
})
else:
logging.warning(f'fetch birth error. {url} ...')
else:
logging.warning(f'fetch birth error. {url} ...')
# return early while debugging
if debug:
return performers
return performers
# Handle ethnicity names that contain spaces
def format_ethnic(ethnic):
return ethnic.replace(' ', '+')
# Fetch the performer list by ethnicity (with pagination)
def fetch_performers_by_ethnic(existed_performer_hrefs):
performers = []
for ethnic in scraper.ethnic_list:
url = scraper.ethnic_url + format_ethnic(ethnic)
next_url = url
while next_url:
logging.info(f"Fetching data for {ethnic}, url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
parser="lxml", preprocessor=scraper.preprocess_html)
if soup:
list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
if list_data:
for row in list_data :
if row['href'] not in existed_performer_hrefs:
performers.append({
'person' : row['person'],
'href' : row['href']
})
else:
logging.warning(f'fetch ethnic error. {url} ...')
else:
logging.warning(f'fetch ethnic error. {url} ...')
# return early while debugging
if debug:
return performers
return performers
# Fetch the distributors list
def fetch_distributors_list(existed_distributors_href):
url = scraper.distributors_list_url
distributors_list = []
logging.info(f"Fetching data for distributors list, url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
if list_data:
for row in list_data :
dis_url = scraper.distributors_base_url + row['href']
if dis_url in existed_distributors_href :
continue
distributors_list.append({
'name' : row['name'],
'href' : dis_url
})
else:
logging.warning(f'fetch distributors list error. {url} ...')
else:
logging.warning(f'fetch distributors list error. {url} ...')
return distributors_list
# Fetch the studios list
def fetch_studios_list(existed_studios_href):
url = scraper.studios_list_url
studios_list = []
logging.info(f"Fetching data for studios list, url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
if list_data:
for row in list_data :
stu_url = scraper.studios_base_url + row['href']
if stu_url in existed_studios_href:
continue
studios_list.append({
'name' : row['name'],
'href' : stu_url
})
else:
logging.warning(f'fetch studios list error. {url} ...')
else:
logging.warning(f'fetch studios list error. {url} ...')
return studios_list
# Check for updates
def check_update():
# Load the list of performers already in the database
existed_performer_hrefs = utils.query_performer_hrefs()
if not existed_performer_hrefs:
logging.warning(f'get existed performers from db error.')
return None
# Collect new performers from the list pages
new_performers = []
#new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
#new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))
# Fetch each new performer's detail page and write it to the DB
new_performers = list({item["href"]: item for item in new_performers}.values())
logging.info(f'get new performers count: {len(new_performers)} ')
for performer in new_performers:
url = performer['href']
person = performer['person']
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
if soup:
data, credits = scraper.parse_page_performer(soup)
if data:
performer_id = utils.insert_or_update_performer({
'href': url,
'person': person,
**data
})
if performer_id:
logging.info(f'insert one person, id: {performer_id}, person: {person}, url: {url}')
else:
logging.warning(f'insert person: {person} {url} failed.')
# Also write a local JSON file
func.write_person_json(person, url, {
'href': url,
'person': person,
**data,
'credits': credits if credits else {}
})
else:
logging.warning(f'parse_page_performer error. person: {person}, url: {url}')
else:
logging.warning(f'fetch_page error. person: {person}, url: {url}')
# break early while debugging
if debug:
break
# Load the distributors list from the database
existed_distributors_href = utils.query_distributor_hrefs()
if existed_distributors_href is None:
logging.warning(f'get existed distributors from db error.')
return
new_distributors = fetch_distributors_list(existed_distributors_href)
for dist in new_distributors:
dist_id = utils.insert_or_update_distributor(dist)
if dist_id:
logging.info(f"insert one distributor record, id: {dist_id}, name: {dist['name']}, href: {dist['href']}")
else:
logging.warning(f"insert into distributor failed. name: {dist['name']} href: {dist['href']}")
# Load the studios list from the database
existed_studios_href = utils.query_studio_hrefs()
if existed_studios_href is None:
logging.warning(f'get existed studios from db error.')
return
new_studios = fetch_studios_list(existed_studios_href)
for stu in new_studios:
stu_id = utils.insert_or_update_studio(stu)
if stu_id:
logging.info(f"insert one studio record, id: {stu_id}, name: {stu['name']}, href: {stu['href']}")
else:
logging.warning(f"insert into studio failed. name: {stu['name']}, href: {stu['href']}")
# Load the movie list from the database
existed_movies = utils.query_movie_hrefs()
if existed_movies is None:
logging.warning(f'load movies from db error')
return
new_movies = []
new_movie_hrefs = []
# Iterate over all distributors and collect their movie lists
existed_distributors_href = utils.query_distributor_hrefs(name='vixen')
if existed_distributors_href is None:
logging.warning(f'get existed distributors from db error.')
return
for url in existed_distributors_href:
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
if list_data:
for movie in list_data:
if movie['href'] in existed_movies:
continue
new_movies.append({
'title' : movie['title'],
'href' : movie['href']
})
new_movie_hrefs.append(movie['href'])
else :
logging.warning(f'parse_page_dist_stu error. url: {url}')
# break early while debugging
if debug:
break
logging.info(f'all new movies found for distributors, now total new {len(new_movies)}')
# Iterate over all studios and collect their movie lists
existed_studios_href = utils.query_studio_hrefs(name='vixen')
if existed_studios_href is None:
logging.warning(f'get existed studios from db error.')
return
for url in existed_studios_href:
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
if list_data:
for movie in list_data:
if movie['href'] in existed_movies or movie['href'] in new_movie_hrefs:
continue
new_movies.append({
'title' : movie['title'],
'href' : movie['href']
})
new_movie_hrefs.append(movie['href'])
else :
logging.warning(f'parse_page_dist_stu error. url: {url}')
# break early while debugging
if debug:
break
logging.info(f'all new movies found for studios, now total new {len(new_movies)}')
# Fetch details for each new movie
new_movies = list({item["href"]: item for item in new_movies}.values())
logging.info(f'get merged new movies, count: {len(new_movies)} ')
for movie in new_movies:
url = movie['href']
title = movie['title']
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
if soup:
movie_data = scraper.parse_page_movie(soup, url, title)
if movie_data :
movie_id = utils.insert_or_update_movie(movie_data)
if movie_id:
logging.info(f'insert one movie, id: {movie_id}, title: {title} url: {url}')
else:
logging.warning(f'insert movie {url} failed.')
# Also write a local JSON file
func.write_movie_json(url, movie_data)
else:
logging.warning(f'parse_page_movie error. url: {url}')
else:
logging.warning(f'fetch_page error. url: {url}')
# break early while debugging
if debug:
break
logging.info('all processing completed!')
if __name__ == "__main__":
check_update()
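A small standalone sketch (not part of the commit) of the dedup idiom check_update() uses twice above: building a dict keyed by "href" keeps one entry per href, and .values() gives the de-duplicated list back.

records = [
    {"href": "https://www.iafd.com/title.rme/id=a", "title": "A"},
    {"href": "https://www.iafd.com/title.rme/id=a", "title": "A (duplicate)"},
    {"href": "https://www.iafd.com/title.rme/id=b", "title": "B"},
]
unique = list({item["href"]: item for item in records}.values())
print(len(unique))  # 2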

View File

@@ -0,0 +1,513 @@
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config
# Base URLs and variable parameters
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
ethnic_url = f"{host_url}/lookupethnic.rme/ethnic="
ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american', 'middle eastern', 'mediteranean', 'indian', 'polynesian', 'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african', 'south asian']
distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="
studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="
# Request headers and the cloudscraper session
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Fetch a page with cloudscraper, validate it, and optionally apply a custom parser and preprocessor
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
for attempt in range(max_retries):
try:
response = scraper.get(url, headers=headers)
response.raise_for_status() # raise on HTTP errors
# Preprocess the HTML if a preprocessor was provided
html_text = preprocessor(response.text) if preprocessor else response.text
soup = BeautifulSoup(html_text, parser)
if validator(soup): # custom page validation
return soup
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except cloudscraper.exceptions.CloudflareChallengeError as e:
logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
except cloudscraper.exceptions.CloudflareCode1020 as e:
logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
except Exception as e:
logging.error(f"Unexpected error on {url}: {e}, Retrying...")
logging.error(f'Fetching failed after max retries. {url}')
return None # still failing after the maximum number of retries
# Fix up the HTML: drop stray <br> tags and patch <a> tags (needed when fetching the ethnicity pages)
def preprocess_html(html):
return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
if attr_type == "id":
return soup.find(tag, id=identifier) is not None
elif attr_type == "class":
return bool(soup.find_all(tag, class_=identifier))
elif attr_type == "name":
return bool(soup.find('select', {'name': identifier}))
return False
# Check that the movie info table exists
def movie_validator(soup, table_id):
return soup.find("table", id=table_id) is not None
# Parse the HTML content and extract the data we need
def parse_page_astro(soup, astro):
astro_div = soup.find("div", id="astro")
if not astro_div:
logging.warning(f"Warning: No 'astro' div found in {astro}")
return None, None
flag = False
list_cnt = 0
list_data = []
next_url = None
birth_date = None
for elem in astro_div.find_all(recursive=False):
if elem.name == "h3" and "astroday" in elem.get("class", []):
birth_date = elem.get_text(strip=True)
elif elem.name == "div" and "perficon" in elem.get("class", []):
a_tag = elem.find("a")
if a_tag:
href = host_url + a_tag["href"]
name = a_tag.find("span", class_="perfname")
if name:
list_data.append({
"astrology": astro,
"birth_date": birth_date,
"person": name.get_text(strip=True),
"href": href
})
flag = True
list_cnt = list_cnt +1
if flag:
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
return list_data, next_url
else:
return None, None
# Parse the birthday page content and collect entries
def parse_page_birth(soup, month, day):
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
if not datarows:
return None, None
flag = False
list_cnt = 0
list_data = []
next_url = None
rows = datarows[0].find_all('div', class_='col-sm-4')
for row in rows:
link_tag = row.find('a')
person = link_tag.text.strip() if link_tag else ''
href = link_tag['href'] if link_tag else ''
href = host_url + href
# Skip if this href is already in the list
flag = True
if any(entry['href'] == href for entry in list_data):
continue
# Add the entry to the list
list_data.append({
'month': month,
'day': day,
'person': person,
'href': href
})
list_cnt = list_cnt +1
if flag:
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
return list_data, next_url
else:
return None, None
# Parse the HTML content and extract the data we need
def parse_page_ethnic(soup, ethnic):
rows = soup.find_all('div', class_='row headshotrow')
flag = False
list_data = []
next_url = None
for row in rows:
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
link_tag = col.find('a')
img_tag = col.find('div', class_='pictag')
flag = True
if link_tag and img_tag:
href = host_url + link_tag['href']
person = img_tag.text.strip()
# Store the entry
list_data.append({
'ethnic': ethnic,
'person': person,
'href': href
})
if flag:
logging.debug(f"get {len(list_data)} persons from this page.")
next_page = soup.find('a', rel='next')
if next_page:
next_url = host_url + next_page['href']
logging.debug(f"Found next page: {next_url}")
return list_data, next_url
else:
logging.debug(f"All pages fetched for {ethnic}.")
return list_data, None
else:
return None, None
# Parse the distributor/studio list page
def parse_page_dist_stu_list(soup, select_name):
list_data = []
next_url = None
select_element = soup.find('select', {'name': select_name})
if select_element :
options = select_element.find_all('option')
for option in options:
value = option.get('value') # the value attribute
text = option.text.strip() # the text content
list_data.append({
'name' : text,
'href' : str(value)
})
return list_data, next_url
else:
return None, None
# Parse the distributor/studio movie table
def parse_page_dist_stu(soup, table_id):
table = soup.find("table", id=table_id)
if not table:
logging.warning(f"Warning: No {table_id} table found ")
return None, None
# Find the thead and drop it
thead = table.find('thead')
if thead:
thead.decompose() # remove the thead; it does not need parsing
# Only the tbody remains now
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
list_data = []
next_url = None
for row in rows:
cols = row.find_all('td')
if len(cols) >= 5:
title = cols[0].text.strip()
label = cols[1].text.strip()
year = cols[2].text.strip()
rev = cols[3].text.strip()
a_href = cols[0].find('a')
href = host_url + a_href['href'] if a_href else ''
list_data.append({
'title': title,
'label': label,
'year': year,
'rev': rev,
'href': href
})
return list_data, next_url
# Parse a credits table (covers both performer and directorial credits)
def parse_credits_table(table, distributor_list):
# Find the thead and drop it
thead = table.find('thead')
if thead:
thead.decompose() # remove the thead; it does not need parsing
# Only the tbody remains now
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
movies = []
distributor_count = {key: 0 for key in distributor_list} # initialise a counter per distributor
# rows = table.find_all('tr', class_='we')
for row in rows:
cols = row.find_all('td')
if len(cols) >= 6:
title = cols[0].text.strip()
year = cols[1].text.strip()
distributor = cols[2].text.strip().lower()
notes = cols[3].text.strip()
rev = cols[4].text.strip()
formats = cols[5].text.strip()
for key in distributor_list:
if key in distributor:
distributor_count[key] += 1
movies.append({
'title': title,
'year': year,
'distributor': distributor,
'notes': notes,
'rev': rev,
'formats': formats
})
return movies, distributor_count
# Parse a performer page and extract the data we need
def parse_page_performer(soup):
# Extracted fields
data = {}
# The field names we want and their corresponding labels in the HTML
fields = {
'performer_aka': 'Performer AKA',
'birthday': 'Birthday',
'astrology': 'Astrology',
'birthplace': 'Birthplace',
'gender': 'Gender',
'years_active': 'Years Active',
'ethnicity': 'Ethnicity',
'nationality': 'Nationality',
'hair_colors': 'Hair Colors',
'eye_color': 'Eye Color',
'height': 'Height',
'weight': 'Weight',
'measurements': 'Measurements',
'tattoos': 'Tattoos',
'piercings': 'Piercings'
}
reversed_map = {v: k for k, v in fields.items()}
# Parse the credit tables: performer credits and directorial credits
role_list = ['personal', 'directoral']
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
credits_list = {}
# Track statistics in a dict
distributor_count = {key: 0 for key in distributor_list} # initialise a counter per distributor
for role in role_list:
table = soup.find('table', id=role)
if table :
movies, stat_map = parse_credits_table(table, distributor_list)
credits_list[role] = movies
# Update the distributor counts
for distributor in distributor_list:
distributor_count[distributor] += stat_map.get(distributor, 0)
# Count the movies
#movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
# Nothing found
if len(credits_list) == 0:
logging.warning("movie credit tables are empty.")
# Walk every bioheading to collect the metadata
bioheadings = soup.find_all('p', class_='bioheading')
for bio in bioheadings:
heading = bio.text.strip()
biodata = None
# Entries containing "Performer" need special handling
if 'Performer' in heading:
heading = 'Performer AKA'
biodata_div = bio.find_next('div', class_='biodata')
if biodata_div:
div_text = biodata_div.get_text(separator='|').strip()
biodata = [b.strip() for b in div_text.split('|') if b.strip()]
else:
biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
# Store the value
if heading in reversed_map:
kkey = reversed_map[heading]
data[kkey] = biodata
# Add the counters to data
data['movies_cnt'] = movies_cnt
data['vixen_cnt'] = distributor_count['vixen']
data['blacked_cnt'] = distributor_count['blacked']
data['tushy_cnt'] = distributor_count['tushy']
data['x_art_cnt'] = distributor_count['x-art']
return data, credits_list
# Parse a movie page's HTML and extract the movie info
def parse_page_movie(soup, href, title):
# Basic movie info
movie_data = {}
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
if info_div:
labels = info_div.find_all("p", class_="bioheading")
values = info_div.find_all("p", class_="biodata")
for label, value in zip(labels, values):
key = label.text.strip()
val = value.text.strip()
if key in ["Distributor", "Studio", "Director"]:
link = value.find("a")
if link:
val = link.text.strip()
movie_data[f'{key}Href'] = host_url + link['href']
movie_data[key] = val
else:
return None
# Cast information
performers = []
cast_divs = soup.find_all("div", class_="castbox")
for cast in cast_divs:
performer = {}
link = cast.find("a")
if link:
performer["name"] = link.text.strip()
performer["href"] = host_url + link["href"]
performer["tags"] = [
tag.strip() for br in cast.find_all("br")
if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
]
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
performers.append(performer)
# Scene breakdowns
scene_breakdowns = []
scene_table = soup.find("div", id="sceneinfo")
if scene_table:
rows = scene_table.find_all("tr")
for row in rows:
cols = row.find_all("td")
if len(cols) >= 2:
scene = cols[0].text.strip() # scene number
performer_info = cols[1] # performers plus link info
# Take the full HTML before the first <br> (keeps <i> tags and other formatting)
performer_html = str(performer_info) # the cell's full HTML
split_html = performer_html.split("<br/>") # split on <br/>
if len(split_html) == 1:
split_html = performer_html.split("<br>") # fall back to splitting on <br>
performers_html = split_html[0].strip() # the part before the first <br>, or the whole cell if there is none
# Strip the HTML tags and keep only the text
performers_soup = BeautifulSoup(performers_html, "html.parser")
performers_text = performers_soup.get_text()
# Extract the performers
scene_performers = [p.strip() for p in performers_text.split(",")]
# Try to pick up the `webscene` and `studio` links
links_data = {}
links = performer_info.find_all("a")
if links:
webscene_title = links[0].text.strip() if len(links)>0 else None
webscene = links[0]["href"] if len(links)>0 else None
studio = links[1].text.strip() if len(links)>1 else None
studio_lnk = links[1]["href"] if len(links)>1 else None
links_data = {
"title": webscene_title,
"webscene": webscene,
"studio": studio,
"studio_lnk": studio_lnk,
}
scene_data = {
"scene": scene,
"performers": scene_performers,
**links_data,
}
scene_breakdowns.append(scene_data)
appears_in = []
appears_divs = soup.find("div", id="appearssection")
if appears_divs:
rows = appears_divs.find_all("li")
for row in rows:
lnk = row.find("a")
if lnk:
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
return {
"href": href,
"title": title,
"Minutes": movie_data.get("Minutes", ""),
"Distributor": movie_data.get("Distributor", ""),
"Studio": movie_data.get("Studio", ""),
"ReleaseDate": movie_data.get("Release Date", ""),
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
"All-Girl": movie_data.get("All-Girl", ""),
"All-Male": movie_data.get("All-Male", ""),
"Compilation": movie_data.get("Compilation", ""),
"Webscene": movie_data.get("Webscene", ""),
"Director": movie_data.get("Director", ""),
"DirectorHref": movie_data.get("DirectorHref", ""),
"DistributorHref": movie_data.get("DistributorHref", ""),
"StudioHref": movie_data.get("StudioHref", ""),
"Performers": performers,
"SceneBreakdowns": scene_breakdowns,
"AppearsIn": appears_in,
}
if __name__ == "__main__":
for astro in astro_list:
url = astr_base_url + astro
next_url = url
logging.info(f"Fetching data for {astro}, url {url} ...")
while True:
soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
if soup:
list_data, next_url = parse_page_astro(soup, astro)
if list_data:
print(list_data[0] if len(list_data)>0 else 'no data')
break
else:
logging.info(f"Retrying {next_url} ...")
time.sleep(5) # wait before retrying
time.sleep(2) # throttle the request rate
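For reference, a minimal sketch (not part of the commit) of how fetch_page and generic_validator compose: fetch.py in this commit binds the validator's arguments with functools.partial and passes the result in, exactly as below.

from functools import partial
import iafd_scraper as scraper

url = scraper.astr_base_url + "Aries"
validator = partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id")
soup = scraper.fetch_page(url, validator)
if soup:
    people, _ = scraper.parse_page_astro(soup, "Aries")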

View File

@@ -0,0 +1,459 @@
import sqlite3
import json
import config
import utils
import logging
from datetime import datetime
# Connect to the SQLite database
DB_PATH = f"{config.global_share_data_dir}/shared.db" # change this to your database file
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Current timestamp
def get_current_time():
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Insert or update a performer
def insert_or_update_performer(data):
try:
cursor.execute("""
INSERT INTO performers (href, name, gender, birthday, astrology, birthplace, years_active, ethnicity, nationality, hair_colors,
eye_color, height_str, weight_str, measurements, tattoos, piercings, weight, height, movies_cnt, vixen_cnt,
blacked_cnt, tushy_cnt, x_art_cnt, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
gender = excluded.gender,
birthday = excluded.birthday,
astrology = excluded.astrology,
birthplace = excluded.birthplace,
years_active = excluded.years_active,
ethnicity = excluded.ethnicity,
nationality = excluded.nationality,
hair_colors = excluded.hair_colors,
eye_color = excluded.eye_color,
height_str = excluded.height_str,
weight_str = excluded.weight_str,
measurements = excluded.measurements,
tattoos = excluded.tattoos,
piercings = excluded.piercings,
weight = excluded.weight,
height = excluded.height,
movies_cnt = excluded.movies_cnt,
vixen_cnt = excluded.vixen_cnt,
blacked_cnt = excluded.blacked_cnt,
tushy_cnt = excluded.tushy_cnt,
x_art_cnt = excluded.x_art_cnt,
updated_at = datetime('now', 'localtime')
""", (
data["href"], data["person"], data.get("gender"), data.get("birthday"), data.get("astrology"), data.get("birthplace"), data.get("years_active"),
data.get("ethnicity"), data.get("nationality"), data.get("hair_colors"), data.get("eye_color"), data.get("height"),
data.get("weight"), data.get("measurements"), data.get("tattoos"), data.get("piercings"), utils.parse_weight(data.get('weight')), utils.parse_height(data.get('height')),
data.get("movies_cnt", 0), data.get("vixen_cnt", 0), data.get("blacked_cnt", 0), data.get("tushy_cnt", 0), data.get("x_art_cnt", 0)
))
# Fetch the performer_id
cursor.execute("SELECT id FROM performers WHERE href = ?", (data["href"],))
performer_id = cursor.fetchone()[0]
# Delete the old aliases
cursor.execute("DELETE FROM performer_aliases WHERE performer_id = ?", (performer_id,))
# Insert the new aliases
for alias in data.get("performer_aka", []):
if alias.lower() != "no known aliases":
cursor.execute("INSERT INTO performer_aliases (performer_id, alias) VALUES (?, ?)", (performer_id, alias))
conn.commit()
logging.debug(f"Inserted/updated performer: {data['person']}")
return performer_id
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
except Exception as e:
conn.rollback()
logging.error(f"Unexpected error: {e}")
return None
# Delete a performer by id or href
def delete_performer(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM performers WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM performers WHERE href = ?", (identifier,))
else:
logging.warning("Invalid delete parameter")
return
conn.commit()
logging.info(f"Deleted performer: {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Delete failed: {e}")
# Query performer info by id, href, or name
def query_performer(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM performers WHERE id = ?", (identifier,))
elif "http" in identifier:
cursor.execute("SELECT * FROM performers WHERE href = ?", (identifier,))
else:
cursor.execute("SELECT * FROM performers WHERE name LIKE ?", (f"%{identifier}%",))
performer = cursor.fetchone()
if performer:
cursor.execute("SELECT alias FROM performer_aliases WHERE performer_id = ?", (performer[0],))
aliases = [row[0] for row in cursor.fetchall()]
result = dict(zip([desc[0] for desc in cursor.description], performer))
result["performer_aka"] = aliases
return result
else:
logging.warning(f"Performer not found: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return None
# Query the href list with optional filters
def query_performer_hrefs(**filters):
try:
sql = "SELECT href FROM performers WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"href query failed: {e}")
return None
# Insert or update a distributor
def insert_or_update_distributor(data):
try:
cursor.execute("""
INSERT INTO distributors (name, href, updated_at)
VALUES (?, ? , datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
updated_at = datetime('now', 'localtime')
""", (data["name"], data["href"]))
conn.commit()
# Fetch the distributor id
cursor.execute("SELECT id FROM distributors WHERE href = ?", (data["href"],))
dist_id = cursor.fetchone()[0]
if dist_id:
logging.debug(f"Inserted/updated distributor: {data['name']}")
return dist_id
else:
return None
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
# Delete a distributor (by id or name)
def delete_distributor(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM distributors WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM distributors WHERE name = ?", (identifier,))
conn.commit()
logging.info(f"Deleted distributor: {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Delete failed: {e}")
# Query a distributor (by id or name)
def query_distributor(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM distributors WHERE id = ?", (identifier,))
else:
cursor.execute("SELECT * FROM distributors WHERE name LIKE ?", (f"%{identifier}%",))
distributor = cursor.fetchone()
if distributor:
return dict(zip([desc[0] for desc in cursor.description], distributor))
else:
logging.warning(f"Distributor not found: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return None
# Query the href list with optional filters
def query_distributor_hrefs(**filters):
try:
sql = "SELECT href FROM distributors WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"href query failed: {e}")
return None
# Insert or update a studio
def insert_or_update_studio(data):
try:
cursor.execute("""
INSERT INTO studios (name, href, updated_at)
VALUES (?, ?, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
updated_at = datetime('now', 'localtime')
""", (data["name"], data["href"]))
conn.commit()
# Fetch the studio id
cursor.execute("SELECT id FROM studios WHERE href = ?", (data["href"],))
stu_id = cursor.fetchone()[0]
if stu_id:
logging.debug(f"Inserted/updated studio: {data['name']}")
return stu_id
else:
return None
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
# Delete a studio (by id or name)
def delete_studio(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM studios WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM studios WHERE name = ?", (identifier,))
conn.commit()
logging.info(f"Deleted studio: {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Delete failed: {e}")
# Query a studio (by id or name)
def query_studio(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM studios WHERE id = ?", (identifier,))
else:
cursor.execute("SELECT * FROM studios WHERE name LIKE ?", (f"%{identifier}%",))
studio = cursor.fetchone()
if studio:
return dict(zip([desc[0] for desc in cursor.description], studio))
else:
logging.warning(f"Studio not found: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return None
# Query the href list with optional filters
def query_studio_hrefs(**filters):
try:
sql = "SELECT href FROM studios WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"href query failed: {e}")
return None
# Look up an id by href in the given table
def get_id_by_href(table: str, href: str) -> int:
cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
row = cursor.fetchone()
return row[0] if row else None
# Insert or update a movie record
def insert_or_update_movie(movie_data):
try:
# Resolve the related ids
distributor_id = get_id_by_href('distributors', movie_data['DistributorHref'])
studio_id = get_id_by_href('studios', movie_data['StudioHref'])
director_id = get_id_by_href('performers', movie_data['DirectorHref'])
# Insert or update the movie row
cursor.execute(
"""
INSERT INTO movies (title, minutes, distributor_id, studio_id, release_date, added_to_IAFD_date,
all_girl, all_male, compilation, webscene, director_id, href, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
title=excluded.title, minutes=excluded.minutes, distributor_id=excluded.distributor_id,
studio_id=excluded.studio_id, release_date=excluded.release_date,
added_to_IAFD_date=excluded.added_to_IAFD_date, all_girl=excluded.all_girl,
all_male=excluded.all_male, compilation=excluded.compilation, webscene=excluded.webscene,
director_id=excluded.director_id, updated_at = datetime('now', 'localtime')
""",
(movie_data['title'], movie_data['Minutes'], distributor_id, studio_id, movie_data['ReleaseDate'],
movie_data['AddedtoIAFDDate'], movie_data['All-Girl'], movie_data['All-Male'],
movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
)
conn.commit()
logging.info("Movie inserted/updated: %s", movie_data['title'])
# Fetch the movie_id that was just inserted
cursor.execute("SELECT id FROM movies WHERE href = ?", (movie_data['href'],))
movie_id = cursor.fetchone()[0]
# Populate the performers_movies relation table
for performer in movie_data.get('Performers', []):
performer_id = get_id_by_href('performers', performer['href'])
if performer_id:
notes = '|'.join(performer['tags'])
cursor.execute(
"""
INSERT INTO performers_movies (performer_id, movie_id, role, notes)
VALUES (?, ?, ?, ?)
ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes
""",
(performer_id, movie_id, "Actor", notes)
)
logging.debug(f"Performer {performer['href']} linked to movie: {movie_data['title']}")
else:
logging.warning(f"missing performer, url {performer['href']}, in movie: ({movie_data['title']}) {movie_data['href']}")
# Populate the movies_appers_in table
for appears in movie_data.get("AppearsIn", []):
appears_in_id = get_id_by_href('movies', appears['href'])
if appears_in_id:
cursor.execute("""
INSERT INTO movies_appers_in (movie_id, appears_in_id, gradation, notes)
VALUES (?, ?, ?, ?)
ON CONFLICT(movie_id, appears_in_id) DO NOTHING
""", (movie_id, appears_in_id, 1, appears["title"]))
else:
logging.warning(f"missing AppearsIn movie in movies table, parent_url {appears['href']}, in movie: ({movie_data['title']}) {movie_data['href']}")
conn.commit()
return movie_id
except Exception as e:
conn.rollback()
logging.error("Error inserting movie: %s", e)
return None
# Delete a movie
def delete_movie(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM movies WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM movies WHERE href = ?", (identifier,))
else:
logging.warning("Invalid delete parameter")
return
conn.commit()
logging.info(f"Deleted movie with {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error("Error deleting movie: %s", e)
# Query a movie
def query_movies(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM movies WHERE id = ?", (identifier,))
elif "http" in identifier:
cursor.execute("SELECT * FROM movies WHERE href = ?", (identifier,))
else:
cursor.execute("SELECT * FROM movies WHERE title LIKE ?", (f"%{identifier}%",))
movie = cursor.fetchone()
if movie:
# capture the movie column names before the relation query overwrites cursor.description
movie_cols = [desc[0] for desc in cursor.description]
cursor.execute("SELECT performer_id FROM performers_movies WHERE movie_id = ?", (movie[0],))
performers = [row[0] for row in cursor.fetchall()]
result = dict(zip(movie_cols, movie))
result["performers"] = performers
return result
else:
logging.warning(f"no matching movie found: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return None
# Query the href list with optional filters
def query_movie_hrefs(**filters):
try:
sql = "SELECT href FROM movies WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "title" in filters:
sql += " AND title LIKE ?"
params.append(f"%{filters['title']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"href query failed: {e}")
return []
if __name__ == "__main__":
try:
with open('../result/detail.json', 'r') as file:
performers = json.load(file)
for performer in performers:
insert_or_update_performer(performer)
print(query_performer("Kirsten"))
#delete_performer("https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34")
print(query_performer_hrefs())
except FileNotFoundError:
logging.info("detail.json not found, starting fresh.")
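A hypothetical usage sketch of the distributor helpers above (the name and href come from the JSON samples earlier in this commit; the call pattern mirrors how fetch.py uses this module):

import sqlite_utils as utils

dist_id = utils.insert_or_update_distributor({
    "name": "Exotic Vixen Films",
    "href": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
})
print(dist_id, utils.query_distributor_hrefs(name="vixen"))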

92
scripts/iafd/src/utils.py Normal file
View File

@@ -0,0 +1,92 @@
import re
import os
import json
import time
import csv
import logging
# Parse height and weight strings into numbers
def parse_height(height_str):
try:
return int(height_str.split("(")[-1].replace(" cm)", ""))
except:
return None
def parse_weight(weight_str):
try:
return int(weight_str.split(" ")[0])
except:
return None
update_dir = '../result'
performers_dir = f'{update_dir}/performers'
movies_dir = f'{update_dir}/movies'
def uniq_performers(new_performers):
try:
if not isinstance(new_performers, list):
raise TypeError(f"new_performers should be a list, but got {type(new_performers)}")
seen = set()
unique_performers = []
for item in new_performers:
if not item or item['href'] is None:
raise ValueError(f"Invalid item in new_performers: {item}")
if item["href"] not in seen:
seen.add(item["href"])
unique_performers.append(item)
return unique_performers
except Exception as e:
logging.error(f"Error in uniq_performers: {e}")
return [] # return an empty list so the caller does not crash
# Create the sub-directory
def create_sub_directory(base_dir, name):
# Use the first letter of the name, lower-cased, as the sub-directory
sub_dir = name[:1].lower()
full_path = os.path.join(base_dir, sub_dir)
if not os.path.exists(full_path):
os.makedirs(full_path)
return full_path
# Extract the id value from e.g. https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
"""Extract the id parameter from an href."""
match = re.search(r'id=([a-f0-9\-]+)', href)
return match.group(1) if match else ''
# Write each performer to its own JSON file
def write_person_json(person, href, data):
# Resolve the output directory
person_dir = create_sub_directory(performers_dir, person)
person_id = extract_id_from_href(href)
person_filename = f"{person.replace(' ', '-')}({person_id}).json" # spaces replaced with -
full_path = os.path.join(person_dir, person_filename)
try:
with open(full_path, 'w', encoding='utf-8') as json_file:
json.dump(data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {full_path}: {e}")
# Write each movie to its own JSON file
def write_movie_json(href, data):
# Resolve the output directory
movie_id = extract_id_from_href(href)
person_dir = create_sub_directory(movies_dir, movie_id)
person_filename = f"{movie_id}.json"
full_path = os.path.join(person_dir, person_filename)
try:
with open(full_path, 'w', encoding='utf-8') as json_file:
json.dump(data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {full_path}: {e}")
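A hypothetical usage sketch of these helpers (the href is the example already cited in the comment above extract_id_from_href; "Jane Doe" is a made-up name for illustration, and output paths assume the ../result layout defined at the top of this file):

import utils

href = "https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586"
print(utils.extract_id_from_href(href))  # 21898a3c-1ddd-4793-8d93-375d6db20586
# writes Jane-Doe(<id>).json under ../result/performers/j/
utils.write_person_json("Jane Doe", href, {"person": "Jane Doe", "href": href})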