diff --git a/gitignore b/gitignore
index a57550b..d482c08 100644
--- a/gitignore
+++ b/gitignore
@@ -12,6 +12,7 @@ scripts/iafd/data/tmp/
scripts/iafd/result/tmp/
scripts/iafd/result/bak/
scripts/iafd/result/performers/
+scripts/iafd/result/movies/
scripts/iafd/log/
scripts/thelordofporn/log/
scripts/vixen_group/log/
diff --git a/scripts/iafd/result/movies/0/0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json b/scripts/iafd/result/movies/0/0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json
new file mode 100644
index 0000000..e04eb58
--- /dev/null
+++ b/scripts/iafd/result/movies/0/0b332f5e-c0b8-4536-a79c-5abd4da9c68a.json
@@ -0,0 +1,20 @@
+{
+ "href": "https://www.iafd.com/title.rme/id=0b332f5e-c0b8-4536-a79c-5abd4da9c68a",
+ "title": "Barebackin' Men",
+ "Minutes": "No Data",
+ "Distributor": "1 Distribution",
+ "Studio": "1 Distribution",
+ "ReleaseDate": "No Data",
+ "AddedtoIAFDDate": "Jan 1, 2006",
+ "All-Girl": "No",
+ "All-Male": "Yes",
+ "Compilation": "No",
+ "Webscene": "",
+ "Director": "No Data",
+ "DirectorHref": "",
+ "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=10/1%2ddistribution.htm",
+ "StudioHref": "https://www.iafd.com/studio.rme/studio=10/1%2ddistribution.htm",
+ "Performers": [],
+ "SceneBreakdowns": [],
+ "AppearsIn": []
+}
\ No newline at end of file
diff --git a/scripts/iafd/result/movies/2/2f582dcf-192e-4adf-9d60-447df8f16b9c.json b/scripts/iafd/result/movies/2/2f582dcf-192e-4adf-9d60-447df8f16b9c.json
new file mode 100644
index 0000000..51f224d
--- /dev/null
+++ b/scripts/iafd/result/movies/2/2f582dcf-192e-4adf-9d60-447df8f16b9c.json
@@ -0,0 +1,56 @@
+{
+ "href": "https://www.iafd.com/title.rme/id=2f582dcf-192e-4adf-9d60-447df8f16b9c",
+ "title": "Slim Goodies POV 2",
+ "Minutes": "84",
+ "Distributor": "Exotic Vixen Films",
+ "Studio": "Exotic Vixen Films",
+ "ReleaseDate": "No Data",
+ "AddedtoIAFDDate": "Jan 17, 2024",
+ "All-Girl": "No",
+ "All-Male": "No",
+ "Compilation": "No",
+ "Webscene": "",
+ "Director": "Just Mike Starks",
+ "DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
+ "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
+ "StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
+ "Performers": [
+ {
+ "name": "Amica Mea",
+ "href": "https://www.iafd.com/person.rme/id=933ab6e6-ba98-49a7-a9e2-c1a4523ab89c",
+ "tags": [
+ "Amica Mea"
+ ]
+ },
+ {
+ "name": "Baby Breezy",
+ "href": "https://www.iafd.com/person.rme/id=b0d3673c-20b4-41eb-bb15-622b2f8e5ed3",
+ "tags": [
+ "Baby Breezy"
+ ]
+ },
+ {
+ "name": "Blu Mere",
+ "href": "https://www.iafd.com/person.rme/id=8d41dd04-f188-44c3-ac40-dc64711cf905",
+ "tags": [
+ "Blu Mere"
+ ]
+ },
+ {
+ "name": "Just Mike Starks",
+ "href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
+ "tags": [
+ "Just Mike Starks"
+ ]
+ },
+ {
+ "name": "Mocha Menage",
+ "href": "https://www.iafd.com/person.rme/id=63b084e9-a144-41da-acf9-a913b68416cd",
+ "tags": [
+ "Mocha Menage"
+ ]
+ }
+ ],
+ "SceneBreakdowns": [],
+ "AppearsIn": []
+}
\ No newline at end of file
diff --git a/scripts/iafd/result/movies/9/9af4e9f4-68ce-47ec-a7d7-fde92862af57.json b/scripts/iafd/result/movies/9/9af4e9f4-68ce-47ec-a7d7-fde92862af57.json
new file mode 100644
index 0000000..94cc88d
--- /dev/null
+++ b/scripts/iafd/result/movies/9/9af4e9f4-68ce-47ec-a7d7-fde92862af57.json
@@ -0,0 +1,70 @@
+{
+ "href": "https://www.iafd.com/title.rme/id=9af4e9f4-68ce-47ec-a7d7-fde92862af57",
+ "title": "Atlanta U: College Freaks",
+ "Minutes": "No Data",
+ "Distributor": "Exotic Vixen Films",
+ "Studio": "Exotic Vixen Films",
+ "ReleaseDate": "No Data",
+ "AddedtoIAFDDate": "Sep 19, 2020",
+ "All-Girl": "No",
+ "All-Male": "No",
+ "Compilation": "No",
+ "Webscene": "",
+ "Director": "Just Mike Starks",
+ "DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
+ "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
+ "StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
+ "Performers": [
+ {
+ "name": "Aaliyah Ali",
+ "href": "https://www.iafd.com/person.rme/id=7904b6a0-965c-4137-92e2-2ca0cdc4ff38",
+ "tags": [
+ "Aaliyah Ali"
+ ]
+ },
+ {
+ "name": "Bones Montana",
+ "href": "https://www.iafd.com/person.rme/id=8c18f9ad-045d-475f-bee8-cab6bed1fad4",
+ "tags": [
+ "Bones Montana"
+ ]
+ },
+ {
+ "name": "Cameron Cox",
+ "href": "https://www.iafd.com/person.rme/id=ce96cf3d-e85e-4d46-ad5e-f05881c88a26",
+ "tags": [
+ "Cameron Cox"
+ ]
+ },
+ {
+ "name": "Crystal Cooper",
+ "href": "https://www.iafd.com/person.rme/id=fc02ac73-2892-4578-9bf4-eed87afc4980",
+ "tags": [
+ "Crystal Cooper"
+ ]
+ },
+ {
+ "name": "Jazmine Adore",
+ "href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
+ "tags": [
+ "Jazmine Adore"
+ ]
+ },
+ {
+ "name": "Just Mike Starks",
+ "href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
+ "tags": [
+ "Just Mike Starks"
+ ]
+ },
+ {
+ "name": "Lala Ivey",
+ "href": "https://www.iafd.com/person.rme/id=7ae838ee-764f-4725-b422-b41ffac1e67b",
+ "tags": [
+ "Lala Ivey"
+ ]
+ }
+ ],
+ "SceneBreakdowns": [],
+ "AppearsIn": []
+}
\ No newline at end of file
diff --git a/scripts/iafd/result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json b/scripts/iafd/result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json
new file mode 100644
index 0000000..08af9d1
--- /dev/null
+++ b/scripts/iafd/result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json
@@ -0,0 +1,85 @@
+{
+ "href": "https://www.iafd.com/title.rme/id=ca753243-8e3a-49ac-88aa-357055187e8c",
+ "title": "Slim Goodies POV",
+ "Minutes": "61",
+ "Distributor": "Exotic Vixen Films",
+ "Studio": "Exotic Vixen Films",
+ "ReleaseDate": "No Data",
+ "AddedtoIAFDDate": "Sep 19, 2020",
+ "All-Girl": "No",
+ "All-Male": "No",
+ "Compilation": "No",
+ "Webscene": "",
+ "Director": "Just Mike Starks",
+ "DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
+ "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
+ "StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
+ "Performers": [
+ {
+ "name": "Gina Ferrero",
+ "href": "https://www.iafd.com/person.rme/id=54d62a5b-2ce5-40fa-b050-14325a62f4fd",
+ "tags": [
+ "Gina Ferrero"
+ ]
+ },
+ {
+ "name": "Imani Reign",
+ "href": "https://www.iafd.com/person.rme/id=8566216f-bcbc-4764-ba50-73405cf79dce",
+ "tags": [
+ "Imani Reign"
+ ]
+ },
+ {
+ "name": "Jazmine Adore",
+ "href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
+ "tags": [
+ "Jazmine Adore"
+ ]
+ },
+ {
+ "name": "Just Mike Starks",
+ "href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
+ "tags": [
+ "Just Mike Starks"
+ ]
+ },
+ {
+ "name": "Niomie King",
+ "href": "https://www.iafd.com/person.rme/id=dbefea32-7f88-41a9-9de3-e467015d687a",
+ "tags": [
+ "Niomie King"
+ ]
+ }
+ ],
+ "SceneBreakdowns": [
+ {
+ "scene": "Scene 1",
+ "performers": [
+ "Imani Reign",
+ "Just Mike Starks"
+ ]
+ },
+ {
+ "scene": "Scene 2",
+ "performers": [
+ "Jazmine Adore",
+ "Just Mike Starks"
+ ]
+ },
+ {
+ "scene": "Scene 3",
+ "performers": [
+ "Gina Ferrero",
+ "Just Mike Starks"
+ ]
+ },
+ {
+ "scene": "Scene 4",
+ "performers": [
+ "Niomie King",
+ "Just Mike Starks"
+ ]
+ }
+ ],
+ "AppearsIn": []
+}
\ No newline at end of file
diff --git a/scripts/iafd/src/config.py b/scripts/iafd/src/config.py
new file mode 100644
index 0000000..fc7fc09
--- /dev/null
+++ b/scripts/iafd/src/config.py
@@ -0,0 +1,26 @@
+import logging
+import os
+import inspect
+from datetime import datetime
+
+global_share_data_dir = '/root/sharedata'
+global_host_data_dir = '/root/hostdir/scripts_data'
+
+# Configure logging
+def setup_logging(log_filename=None):
+    # If no log_filename is given, derive one from the calling script's name
+ if log_filename is None:
+        # Name of the script that called setup_logging
+ caller_frame = inspect.stack()[1]
+ caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
+
+        # Current date, formatted as yyyymmdd
+        current_date = datetime.now().strftime('%Y%m%d')
+        # Build the log filename, inserting the date before the extension
+ log_filename = f'../log/{caller_filename}_{current_date}.log'
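+        # e.g. ../log/fetch_20240117.log when fetch.py calls setup_logging() on 2024-01-17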
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
+ handlers=[
+ logging.FileHandler(log_filename),
+ logging.StreamHandler()
+ ])
\ No newline at end of file
diff --git a/scripts/iafd/src/fetch.py b/scripts/iafd/src/fetch.py
new file mode 100644
index 0000000..4d670b3
--- /dev/null
+++ b/scripts/iafd/src/fetch.py
@@ -0,0 +1,320 @@
+
+import logging
+from functools import partial
+import config
+import sqlite_utils as utils
+import iafd_scraper as scraper
+import utils as func
+
+config.setup_logging()
+
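+# When True, every crawl loop bails out after its first iteration so a full run can be smoke-tested quickly.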
+debug = True
+
+# Fetch the performer list by astrological sign; these pages are not paginated.
+def fetch_performers_by_astro(existed_performer_hrefs):
+ performers = []
+
+ for astro in scraper.astro_list:
+ url = scraper.astr_base_url + astro
+ logging.info(f"Fetching data for {astro}, url {url} ...")
+
+ soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
+ if soup:
+ list_data, next_url = scraper.parse_page_astro(soup, astro)
+ if list_data:
+ for row in list_data :
+ if row['href'] not in existed_performer_hrefs:
+ performers.append({
+ 'person' : row['person'],
+ 'href' : row['href']
+ })
+            else:
+                logging.warning(f'parse astro page error. {url} ...')
+        else:
+            logging.warning(f'fetch astro page error. {url} ...')
+
+ # 调试添加break
+ if debug:
+ break
+ return performers
+
+
+# Fetch the performer list by birthday; these pages are not paginated.
+def fetch_performers_by_birth(existed_performer_hrefs):
+ performers = []
+
+    for month in range(1, 13):  # months 1-12
+        for day in range(1, 32):  # days 1-31
+ url = scraper.birth_base_url.format(month=month, day=day)
+ logging.info(f"Fetching data for birth, url {url}")
+ soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
+ if soup:
+ list_data, next_url = scraper.parse_page_birth(soup, month, day)
+ if list_data:
+ for row in list_data :
+ if row['href'] not in existed_performer_hrefs:
+ performers.append({
+ 'person' : row['person'],
+ 'href' : row['href']
+ })
+                else:
+                    logging.warning(f'parse birth page error. {url} ...')
+            else:
+                logging.warning(f'fetch birth page error. {url} ...')
+
+            # Return early when debugging
+ if debug:
+ return performers
+
+ return performers
+
+# URL-encode the spaces in multi-word ethnicity names
+def format_ethnic(ethnic):
+ return ethnic.replace(' ', '+')
+
+# Fetch the performer list by ethnicity; these pages are paginated.
+def fetch_performers_by_ethnic(existed_performer_hrefs):
+ performers = []
+
+ for ethnic in scraper.ethnic_list:
+ url = scraper.ethnic_url + format_ethnic(ethnic)
+ next_url = url
+
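+        # Ethnicity listings paginate: keep following next_url until the parser reports no further page.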
+        while next_url:
+            page_url = next_url
+            logging.info(f"Fetching data for {ethnic}, url {page_url} ...")
+            soup = scraper.fetch_page(page_url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
+ parser="lxml", preprocessor=scraper.preprocess_html)
+ if soup:
+ list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
+ if list_data:
+ for row in list_data :
+ if row['href'] not in existed_performer_hrefs:
+ performers.append({
+ 'person' : row['person'],
+ 'href' : row['href']
+ })
+                else:
+                    logging.warning(f'parse ethnic page error. {page_url} ...')
+            else:
+                logging.warning(f'fetch ethnic page error. {page_url} ...')
+                break
+
+            # Return early when debugging
+ if debug:
+ return performers
+ return performers
+
+
+# Fetch the distributor list
+def fetch_distributors_list(existed_distributors_href):
+ url = scraper.distributors_list_url
+ distributors_list = []
+
+ logging.info(f"Fetching data for distributors list, url {url} ...")
+ soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
+ if soup:
+ list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
+ if list_data:
+ for row in list_data :
+ dis_url = scraper.distributors_base_url + row['href']
+ if dis_url in existed_distributors_href :
+ continue
+ distributors_list.append({
+ 'name' : row['name'],
+ 'href' : dis_url
+ })
+        else:
+            logging.warning(f'parse distributors list error. {url} ...')
+    else:
+        logging.warning(f'fetch distributors list error. {url} ...')
+ return distributors_list
+
+# Fetch the studio list
+def fetch_studios_list(existed_studios_href):
+ url = scraper.studios_list_url
+ studios_list = []
+
+ logging.info(f"Fetching data for studios list, url {url} ...")
+ soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
+ if soup:
+ list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
+ if list_data:
+ for row in list_data :
+ stu_url = scraper.studios_base_url + row['href']
+ if stu_url in existed_studios_href:
+ continue
+ studios_list.append({
+ 'name' : row['name'],
+ 'href' : stu_url
+ })
+        else:
+            logging.warning(f'parse studios list error. {url} ...')
+    else:
+        logging.warning(f'fetch studios list error. {url} ...')
+ return studios_list
+
+# Check for updates
+def check_update():
+    # Load the known performer hrefs from the database
+ existed_performer_hrefs = utils.query_performer_hrefs()
+    if existed_performer_hrefs is None:
+ logging.warning(f'get existed performers from db error.')
+ return None
+
+    # Collect new performers from the listing pages
+ new_performers = []
+ #new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
+ #new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
+ new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))
+
+    # Fetch each performer's detail page and write it to the db
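+    # Dedupe by href first: keying a dict on href keeps exactly one entry per performer.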
+ new_performers = list({item["href"]: item for item in new_performers}.values())
+ logging.info(f'get new performers count: {len(new_performers)} ')
+ for performer in new_performers:
+ url = performer['href']
+ person = performer['person']
+ soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
+ if soup:
+ data, credits = scraper.parse_page_performer(soup)
+ if data:
+ performer_id = utils.insert_or_update_performer({
+ 'href': url,
+ 'person': person,
+ **data
+ })
+ if performer_id:
+ logging.info(f'insert one person, id: {performer_id}, person: {person}, url: {url}')
+ else:
+ logging.warning(f'insert person: {person} {url} failed.')
+
+                    # Also write a local JSON file
+ func.write_person_json(person, url, {
+ 'href': url,
+ 'person': person,
+ **data,
+ 'credits': credits if credits else {}
+ })
+ else:
+ logging.warning(f'parse_page_performer error. person: {person}, url: {url}')
+ else:
+ logging.warning(f'fetch_page error. person: {person}, url: {url}')
+            # Break early when debugging
+ if debug:
+ break
+
+    # Load the known distributors from the database
+ existed_distributors_href = utils.query_distributor_hrefs()
+ if existed_distributors_href is None:
+ logging.warning(f'get existed distributors from db error.')
+ return
+ new_distributors = fetch_distributors_list(existed_distributors_href)
+ for dist in new_distributors:
+ dist_id = utils.insert_or_update_distributor(dist)
+ if dist_id:
+            logging.info(f"insert one distributor record, id: {dist_id}, name: {dist['name']}, href: {dist['href']}")
+        else:
+            logging.warning(f"insert distributor failed. name: {dist['name']}, href: {dist['href']}")
+
+    # Load the known studios from the database
+ existed_studios_href = utils.query_studio_hrefs()
+ if existed_studios_href is None:
+ logging.warning(f'get existed studios from db error.')
+ return
+ new_studios = fetch_studios_list(existed_studios_href)
+ for stu in new_studios:
+ stu_id = utils.insert_or_update_studio(stu)
+ if stu_id:
+            logging.info(f"insert one studio record, id: {stu_id}, name: {stu['name']}, href: {stu['href']}")
+        else:
+            logging.warning(f"insert studio failed. name: {stu['name']}, href: {stu['href']}")
+
+    # Load the known movie hrefs from the database
+ existed_movies = utils.query_movie_hrefs()
+ if existed_movies is None:
+ logging.warning(f'load movies from db error')
+ return
+ new_movies = []
+ new_movie_hrefs = []
+
+    # Walk all distributors and collect their movie lists
+ existed_distributors_href = utils.query_distributor_hrefs(name='vixen')
+ if existed_distributors_href is None:
+ logging.warning(f'get existed distributors from db error.')
+ return
+ for url in existed_distributors_href:
+ soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
+ if soup:
+ list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
+ if list_data:
+ for movie in list_data:
+ if movie['href'] in existed_movies:
+ continue
+ new_movies.append({
+ 'title' : movie['title'],
+ 'href' : movie['href']
+ })
+ new_movie_hrefs.append(movie['href'])
+            else:
+                logging.warning(f'parse distributor movie list error. url: {url}')
+            # Break early when debugging
+ if debug:
+ break
+    logging.info(f'all new movies found for distributors, total new so far: {len(new_movies)}')
+
+    # Walk all studios and collect their movie lists
+ existed_studios_href = utils.query_studio_hrefs(name='vixen')
+ if existed_studios_href is None:
+ logging.warning(f'get existed studios from db error.')
+ return
+ for url in existed_studios_href:
+ soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
+ if soup:
+ list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
+ if list_data:
+ for movie in list_data:
+                    # Skip movies already in the db or already queued from the distributor pass
+                    if movie['href'] in existed_movies or movie['href'] in new_movie_hrefs:
+ continue
+ new_movies.append({
+ 'title' : movie['title'],
+ 'href' : movie['href']
+ })
+ new_movie_hrefs.append(movie['href'])
+            else:
+                logging.warning(f'parse studio movie list error. url: {url}')
+            # Break early when debugging
+ if debug:
+ break
+    logging.info(f'all new movies found for studios, total new so far: {len(new_movies)}')
+
+    # Fetch the detail page for each new movie
+ new_movies = list({item["href"]: item for item in new_movies}.values())
+ logging.info(f'get merged new movies, count: {len(new_movies)} ')
+ for movie in new_movies:
+ url = movie['href']
+ title = movie['title']
+ soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
+ if soup:
+ movie_data = scraper.parse_page_movie(soup, url, title)
+ if movie_data :
+ movie_id = utils.insert_or_update_movie(movie_data)
+ if movie_id:
+ logging.info(f'insert one movie, id: {movie_id}, title: {title} url: {url}')
+ else:
+ logging.warning(f'insert movie {url} failed.')
+
+                # Also write a local JSON file
+ func.write_movie_json(url, movie_data)
+ else:
+ logging.warning(f'parse_page_movie error. url: {url}')
+ else:
+ logging.warning(f'fetch_page error. url: {url}')
+            # Break early when debugging
+ if debug:
+ break
+
+    logging.info('all processing completed!')
+
+if __name__ == "__main__":
+ check_update()
\ No newline at end of file
diff --git a/scripts/iafd/src/iafd_scraper.py b/scripts/iafd/src/iafd_scraper.py
new file mode 100644
index 0000000..1575b24
--- /dev/null
+++ b/scripts/iafd/src/iafd_scraper.py
@@ -0,0 +1,513 @@
+
+import cloudscraper
+import time
+import json
+import csv
+import logging
+import signal
+import sys
+import os
+import re
+from bs4 import BeautifulSoup
+from requests.exceptions import RequestException
+from functools import partial
+import config
+
+# Base URLs and the enumerable parameters used to walk the site
+host_url = "https://www.iafd.com"
+
+astr_base_url = f"{host_url}/astrology.rme/sign="
+astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
+
+birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
+
+ethnic_url = f"{host_url}/lookupethnic.rme/ethnic="
+ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american', 'middle eastern', 'mediteranean', 'indian', 'polynesian', 'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african', 'south asian']
+
+distributors_list_url = f'{host_url}/distrib.asp'
+distributors_base_url = f"{host_url}/distrib.rme/distrib="
+
+studios_list_url = f"{host_url}/studio.asp"
+studios_base_url = f"{host_url}/studio.rme/studio="
+
+# Request headers and the cloudscraper session
+headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+}
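+# cloudscraper transparently solves Cloudflare's JS challenge before handing back the response.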
+scraper = cloudscraper.create_scraper()
+
+# Fetch a page with cloudscraper, validate it, and optionally preprocess the HTML before parsing
+def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
+ for attempt in range(max_retries):
+ try:
+ response = scraper.get(url, headers=headers)
+            response.raise_for_status()  # raise on HTTP errors
+
+            # Preprocess the HTML if a preprocessor was supplied
+ html_text = preprocessor(response.text) if preprocessor else response.text
+
+ soup = BeautifulSoup(html_text, parser)
+            if validator(soup):  # run the page-specific validation
+ return soup
+
+ logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
+ except cloudscraper.exceptions.CloudflareChallengeError as e:
+ logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retring...")
+ except cloudscraper.exceptions.CloudflareCode1020 as e:
+ logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retring...")
+ except Exception as e:
+ logging.error(f"Unexpected error on {url}: {e}, Retring...")
+
+ logging.error(f'Fetching failed after max retries. {url}')
+    return None  # still failing after max retries
+
+# Repair the page HTML before parsing: strip stray <br> tags that break the
+# markup; needed when fetching the ethnicity listing pages (best-effort cleanup).
+def preprocess_html(html):
+    return html.replace('<br>', '').replace('</br>', '')
+
+# Generic page validator: a page counts as valid when the given tag carrying
+# the given id/class/name attribute is present.
+def generic_validator(soup, tag, identifier, attr_type):
+    return soup.find(tag, {attr_type: identifier}) is not None
+
+# Parse a distributor/studio movie table (signature matches its call sites in
+# fetch.py); returns the rows plus the next page URL, if any.
+def parse_page_dist_stu(soup, table_id):
+    list_data = []
+    next_url = None
+    table = soup.find('table', id=table_id)
+    rows = table.find_all('tr') if table else []
+    for row in rows:
+        cols = row.find_all('td')
+        if len(cols) >= 5:
+            title = cols[0].text.strip()
+            label = cols[1].text.strip()
+            year = cols[2].text.strip()
+            rev = cols[3].text.strip()
+            a_href = cols[0].find('a')
+            href = host_url + a_href['href'] if a_href else ''
+
+            list_data.append({
+                'title': title,
+                'label': label,
+                'year': year,
+                'rev': rev,
+                'href': href
+            })
+    return list_data, next_url
+
+
+# Parse a credits table; it covers both performer and directorial credits.
+def parse_credits_table(table, distributor_list):
+    # Find and drop the thead; only the body rows are parsed
+ thead = table.find('thead')
+ if thead:
+        thead.decompose()  # remove the thead; it carries no data
+
+    # Only the tbody remains now
+ tbody = table.find('tbody')
+ rows = tbody.find_all('tr') if tbody else []
+
+ movies = []
+    distributor_count = {key: 0 for key in distributor_list}  # initialize each distributor's counter
+
+ for row in rows:
+ cols = row.find_all('td')
+ if len(cols) >= 6:
+ title = cols[0].text.strip()
+ year = cols[1].text.strip()
+ distributor = cols[2].text.strip().lower()
+ notes = cols[3].text.strip()
+ rev = cols[4].text.strip()
+ formats = cols[5].text.strip()
+
+ for key in distributor_list:
+ if key in distributor:
+ distributor_count[key] += 1
+
+ movies.append({
+ 'title': title,
+ 'year': year,
+ 'distributor': distributor,
+ 'notes': notes,
+ 'rev': rev,
+ 'formats': formats
+ })
+ return movies, distributor_count
+
+
+# Parse a performer page: extract bio metadata plus the credits lists
+def parse_page_performer(soup):
+    # Extracted bio fields
+ data = {}
+
+ # 定义我们需要的字段名称和HTML中对应的标签
+ fields = {
+ 'performer_aka': 'Performer AKA',
+ 'birthday': 'Birthday',
+ 'astrology': 'Astrology',
+ 'birthplace': 'Birthplace',
+ 'gender': 'Gender',
+ 'years_active': 'Years Active',
+ 'ethnicity': 'Ethnicity',
+ 'nationality': 'Nationality',
+ 'hair_colors': 'Hair Colors',
+ 'eye_color': 'Eye Color',
+ 'height': 'Height',
+ 'weight': 'Weight',
+ 'measurements': 'Measurements',
+ 'tattoos': 'Tattoos',
+ 'piercings': 'Piercings'
+ }
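+    # Invert the map so the on-page labels can be looked up back to our field keys.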
+ reversed_map = {v: k for k, v in fields.items()}
+
+    # Parse the credits tables for the performer and directorial roles
+ role_list = ['personal', 'directoral']
+ distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
+ credits_list = {}
+
+    # Aggregate the per-distributor counts across both roles
+    distributor_count = {key: 0 for key in distributor_list}  # initialize each distributor's counter
+ for role in role_list:
+ table = soup.find('table', id=role)
+ if table :
+ movies, stat_map = parse_credits_table(table, distributor_list)
+ credits_list[role] = movies
+            # Update the distributor tallies
+ for distributor in distributor_list:
+ distributor_count[distributor] += stat_map.get(distributor, 0)
+
+    # Total movie count across both roles
+    movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
+
+    # No credits tables found at all
+    if len(credits_list) == 0:
+        logging.warning("credits tables empty on this performer page.")
+
+    # Walk each bioheading and collect the metadata
+ bioheadings = soup.find_all('p', class_='bioheading')
+ for bio in bioheadings:
+ heading = bio.text.strip()
+ biodata = None
+
+        # Headings containing "Performer" hold the AKA list and need special handling
+ if 'Performer' in heading:
+ heading = 'Performer AKA'
+ biodata_div = bio.find_next('div', class_='biodata')
+ if biodata_div:
+ div_text = biodata_div.get_text(separator='|').strip()
+ biodata = [b.strip() for b in div_text.split('|') if b.strip()]
+ else:
+ biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
+
+        # Store the value
+ if heading in reversed_map:
+ kkey = reversed_map[heading]
+ data[kkey] = biodata
+
+    # Attach the aggregate counts
+ data['movies_cnt'] = movies_cnt
+ data['vixen_cnt'] = distributor_count['vixen']
+ data['blacked_cnt'] = distributor_count['blacked']
+ data['tushy_cnt'] = distributor_count['tushy']
+ data['x_art_cnt'] = distributor_count['x-art']
+
+ return data, credits_list
+
+
+
+# Parse a movie page's HTML and extract its details
+def parse_page_movie(soup, href, title):
+    # Basic movie info
+ movie_data = {}
+ info_div = soup.find("div", class_="col-xs-12 col-sm-3")
+ if info_div:
+ labels = info_div.find_all("p", class_="bioheading")
+ values = info_div.find_all("p", class_="biodata")
+ for label, value in zip(labels, values):
+ key = label.text.strip()
+ val = value.text.strip()
+ if key in ["Distributor", "Studio", "Director"]:
+ link = value.find("a")
+ if link:
+ val = link.text.strip()
+ movie_data[f'{key}Href'] = host_url + link['href']
+ movie_data[key] = val
+ else:
+ return None
+
+    # Cast information
+ performers = []
+ cast_divs = soup.find_all("div", class_="castbox")
+ for cast in cast_divs:
+ performer = {}
+ link = cast.find("a")
+ if link:
+ performer["name"] = link.text.strip()
+ performer["href"] = host_url + link["href"]
+
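+            # Tags are the bare text nodes that trail each <br> inside the castbox.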
+ performer["tags"] = [
+ tag.strip() for br in cast.find_all("br")
+ if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
+ ]
+
+ #performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
+ performers.append(performer)
+
+    # Scene breakdowns
+ scene_breakdowns = []
+ scene_table = soup.find("div", id="sceneinfo")
+ if scene_table:
+ rows = scene_table.find_all("tr")
+
+ for row in rows:
+ cols = row.find_all("td")
+ if len(cols) >= 2:
+                scene = cols[0].text.strip()  # scene number
+                performer_info = cols[1]  # performers plus link info
+
+                # Take the HTML before the first <br> (keeping <a> tags and other markup);
+                # note str.split always returns a non-empty list, so check for the tag first
+                performer_html = str(performer_info)  # full HTML of the cell
+                if "<br/>" in performer_html:
+                    performers_html = performer_html.split("<br/>")[0].strip()  # part before the <br/>
+                elif "<br>" in performer_html:
+                    performers_html = performer_html.split("<br>")[0].strip()  # part before the <br>
+                else:
+                    performers_html = performer_html.strip()  # no <br>: keep the whole cell
+
+            # Reduce to plain text (strip tags, keep text only)
+ performers_soup = BeautifulSoup(performers_html, "html.parser")
+ performers_text = performers_soup.get_text()
+
+            # Extract the performer names
+ scene_performers = [p.strip() for p in performers_text.split(",")]
+
+            # Try to pull the webscene and studio links
+ links_data = {}
+ links = performer_info.find_all("a")
+ if links:
+                    webscene_title = links[0].text.strip() if len(links) > 0 else None
+                    webscene = links[0]["href"] if len(links) > 0 else None
+                    studio = links[1].text.strip() if len(links) > 1 else None
+                    studio_lnk = links[1]["href"] if len(links) > 1 else None
+ links_data = {
+ "title": webscene_title,
+ "webscene": webscene,
+ "studio": studio,
+ "studio_lnk": studio_lnk,
+ }
+
+ scene_data = {
+ "scene": scene,
+ "performers": scene_performers,
+ **links_data,
+ }
+ scene_breakdowns.append(scene_data)
+
+ appears_in = []
+ appears_divs = soup.find("div", id="appearssection")
+ if appears_divs:
+ rows = appears_divs.find_all("li")
+ for row in rows:
+ lnk = row.find("a")
+ if lnk:
+ appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
+
+
+ return {
+ "href": href,
+ "title": title,
+ "Minutes": movie_data.get("Minutes", ""),
+ "Distributor": movie_data.get("Distributor", ""),
+ "Studio": movie_data.get("Studio", ""),
+ "ReleaseDate": movie_data.get("Release Date", ""),
+ "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
+ "All-Girl": movie_data.get("All-Girl", ""),
+ "All-Male": movie_data.get("All-Male", ""),
+ "Compilation": movie_data.get("Compilation", ""),
+ "Webscene": movie_data.get("Webscene", ""),
+ "Director": movie_data.get("Director", ""),
+ "DirectorHref": movie_data.get("DirectorHref", ""),
+ "DistributorHref": movie_data.get("DistributorHref", ""),
+ "StudioHref": movie_data.get("StudioHref", ""),
+ "Performers": performers,
+ "SceneBreakdowns": scene_breakdowns,
+ "AppearsIn": appears_in,
+ }
+
+
+if __name__ == "__main__":
+
+ for astro in astro_list:
+ url = astr_base_url + astro
+ next_url = url
+ logging.info(f"Fetching data for {astro}, url {url} ...")
+
+ while True:
+ soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
+ if soup:
+ list_data, next_url = parse_page_astro(soup, astro)
+ if list_data:
+ print(list_data[0] if len(list_data)>0 else 'no data')
+ break
+ else:
+ logging.info(f"Retrying {next_url} ...")
+                time.sleep(5)  # wait before retrying
+
+        time.sleep(2)  # throttle the request rate
\ No newline at end of file
diff --git a/scripts/iafd/src/sqlite_utils.py b/scripts/iafd/src/sqlite_utils.py
new file mode 100644
index 0000000..d2f0c10
--- /dev/null
+++ b/scripts/iafd/src/sqlite_utils.py
@@ -0,0 +1,459 @@
+import sqlite3
+import json
+import config
+import utils
+import logging
+from datetime import datetime
+
+# Connect to the SQLite database
+DB_PATH = f"{config.global_share_data_dir}/shared.db"  # adjust to your database file
+conn = sqlite3.connect(DB_PATH)
+cursor = conn.cursor()
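+# Note: one module-level connection and cursor are shared by every helper below.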
+
+# Current timestamp as a formatted string
+def get_current_time():
+ return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+# Insert or update a performer
+def insert_or_update_performer(data):
+ try:
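+        # Upsert keyed on the unique href column: insert a new row, or refresh every mutable field in place.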
+ cursor.execute("""
+ INSERT INTO performers (href, name, gender, birthday, astrology, birthplace, years_active, ethnicity, nationality, hair_colors,
+ eye_color, height_str, weight_str, measurements, tattoos, piercings, weight, height, movies_cnt, vixen_cnt,
+ blacked_cnt, tushy_cnt, x_art_cnt, updated_at)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
+ ON CONFLICT(href) DO UPDATE SET
+ name = excluded.name,
+ gender = excluded.gender,
+ birthday = excluded.birthday,
+ astrology = excluded.astrology,
+ birthplace = excluded.birthplace,
+ years_active = excluded.years_active,
+ ethnicity = excluded.ethnicity,
+ nationality = excluded.nationality,
+ hair_colors = excluded.hair_colors,
+ eye_color = excluded.eye_color,
+ height_str = excluded.height_str,
+ weight_str = excluded.weight_str,
+ measurements = excluded.measurements,
+ tattoos = excluded.tattoos,
+ piercings = excluded.piercings,
+ weight = excluded.weight,
+ height = excluded.height,
+ movies_cnt = excluded.movies_cnt,
+ vixen_cnt = excluded.vixen_cnt,
+ blacked_cnt = excluded.blacked_cnt,
+ tushy_cnt = excluded.tushy_cnt,
+ x_art_cnt = excluded.x_art_cnt,
+ updated_at = datetime('now', 'localtime')
+ """, (
+ data["href"], data["person"], data.get("gender"), data.get("birthday"), data.get("astrology"), data.get("birthplace"), data.get("years_active"),
+ data.get("ethnicity"), data.get("nationality"), data.get("hair_colors"), data.get("eye_color"), data.get("height"),
+ data.get("weight"), data.get("measurements"), data.get("tattoos"), data.get("piercings"), utils.parse_weight(data.get('weight')), utils.parse_height(data.get('height')),
+ data.get("movies_cnt", 0), data.get("vixen_cnt", 0), data.get("blacked_cnt", 0), data.get("tushy_cnt", 0), data.get("x_art_cnt", 0)
+ ))
+
+        # Fetch the performer_id
+ cursor.execute("SELECT id FROM performers WHERE href = ?", (data["href"],))
+ performer_id = cursor.fetchone()[0]
+
+        # Remove the stale aliases
+ cursor.execute("DELETE FROM performer_aliases WHERE performer_id = ?", (performer_id,))
+
+        # Insert the fresh aliases
+ for alias in data.get("performer_aka", []):
+ if alias.lower() != "no known aliases":
+ cursor.execute("INSERT INTO performer_aliases (performer_id, alias) VALUES (?, ?)", (performer_id, alias))
+
+ conn.commit()
+ logging.debug(f"成功插入/更新演员: {data['person']}")
+ return performer_id
+
+ except sqlite3.Error as e:
+ conn.rollback()
+ logging.error(f"数据库错误: {e}")
+ return None
+ except Exception as e:
+ conn.rollback()
+ logging.error(f"未知错误: {e}")
+ return None
+
+# Delete a performer by id or href
+def delete_performer(identifier):
+ try:
+ if isinstance(identifier, int):
+ cursor.execute("DELETE FROM performers WHERE id = ?", (identifier,))
+ elif isinstance(identifier, str):
+ cursor.execute("DELETE FROM performers WHERE href = ?", (identifier,))
+ else:
+ logging.warning("无效的删除参数")
+ return
+ conn.commit()
+ logging.info(f"成功删除演员: {identifier}")
+
+ except sqlite3.Error as e:
+ conn.rollback()
+ logging.error(f"删除失败: {e}")
+
+# Query a performer by id, href, or name
+def query_performer(identifier):
+ try:
+ if isinstance(identifier, int):
+ cursor.execute("SELECT * FROM performers WHERE id = ?", (identifier,))
+ elif "http" in identifier:
+ cursor.execute("SELECT * FROM performers WHERE href = ?", (identifier,))
+ else:
+ cursor.execute("SELECT * FROM performers WHERE name LIKE ?", (f"%{identifier}%",))
+
+ performer = cursor.fetchone()
+ if performer:
+ cursor.execute("SELECT alias FROM performer_aliases WHERE performer_id = ?", (performer[0],))
+ aliases = [row[0] for row in cursor.fetchall()]
+ result = dict(zip([desc[0] for desc in cursor.description], performer))
+ result["performer_aka"] = aliases
+ return result
+ else:
+ logging.warning(f"未找到演员: {identifier}")
+ return None
+
+ except sqlite3.Error as e:
+ logging.error(f"查询失败: {e}")
+ return None
+
+# Query performer hrefs with optional filters
+def query_performer_hrefs(**filters):
+ try:
+ sql = "SELECT href FROM performers WHERE 1=1"
+ params = []
+
+ if "id" in filters:
+ sql += " AND id = ?"
+ params.append(filters["id"])
+ if "href" in filters:
+ sql += " AND href = ?"
+ params.append(filters["href"])
+ if "name" in filters:
+ sql += " AND name LIKE ?"
+ params.append(f"%{filters['name']}%")
+
+ cursor.execute(sql, params)
+ return [row[0] for row in cursor.fetchall()]
+
+ except sqlite3.Error as e:
+ logging.error(f"查询 href 失败: {e}")
+ return None
+
+
+# Insert or update a distributor
+def insert_or_update_distributor(data):
+ try:
+ cursor.execute("""
+ INSERT INTO distributors (name, href, updated_at)
+ VALUES (?, ? , datetime('now', 'localtime'))
+ ON CONFLICT(href) DO UPDATE SET
+ name = excluded.name,
+ updated_at = datetime('now', 'localtime')
+ """, (data["name"], data["href"]))
+ conn.commit()
+
+        # Fetch the distributor id
+ cursor.execute("SELECT id FROM distributors WHERE href = ?", (data["href"],))
+ dist_id = cursor.fetchone()[0]
+ if dist_id:
+ logging.debug(f"成功插入/更新发行商: {data['name']}")
+ return dist_id
+ else:
+ return None
+ except sqlite3.Error as e:
+ conn.rollback()
+ logging.error(f"数据库错误: {e}")
+ return None
+
+# Delete a distributor by id or name
+def delete_distributor(identifier):
+ try:
+ if isinstance(identifier, int):
+ cursor.execute("DELETE FROM distributors WHERE id = ?", (identifier,))
+ elif isinstance(identifier, str):
+ cursor.execute("DELETE FROM distributors WHERE name = ?", (identifier,))
+ conn.commit()
+ logging.info(f"成功删除发行商: {identifier}")
+ except sqlite3.Error as e:
+ conn.rollback()
+ logging.error(f"删除失败: {e}")
+
+# Query a distributor by id or name
+def query_distributor(identifier):
+ try:
+ if isinstance(identifier, int):
+ cursor.execute("SELECT * FROM distributors WHERE id = ?", (identifier,))
+ else:
+ cursor.execute("SELECT * FROM distributors WHERE name LIKE ?", (f"%{identifier}%",))
+
+ distributor = cursor.fetchone()
+ if distributor:
+ return dict(zip([desc[0] for desc in cursor.description], distributor))
+ else:
+ logging.warning(f"未找到发行商: {identifier}")
+ return None
+ except sqlite3.Error as e:
+ logging.error(f"查询失败: {e}")
+ return None
+
+# Query distributor hrefs with optional filters
+def query_distributor_hrefs(**filters):
+ try:
+ sql = "SELECT href FROM distributors WHERE 1=1"
+ params = []
+
+ if "id" in filters:
+ sql += " AND id = ?"
+ params.append(filters["id"])
+ if "url" in filters:
+ sql += " AND href = ?"
+ params.append(filters["href"])
+ if "name" in filters:
+ sql += " AND name LIKE ?"
+ params.append(f"%{filters['name']}%")
+
+ cursor.execute(sql, params)
+ return [row[0] for row in cursor.fetchall()]
+
+ except sqlite3.Error as e:
+ logging.error(f"查询 href 失败: {e}")
+ return None
+
+# """ 插入或更新制作公司 """
+def insert_or_update_studio(data):
+ try:
+ cursor.execute("""
+ INSERT INTO studios (name, href, updated_at)
+ VALUES (?, ?, datetime('now', 'localtime'))
+ ON CONFLICT(href) DO UPDATE SET
+ name = excluded.name,
+ updated_at = datetime('now', 'localtime')
+ """, (data["name"], data["href"]))
+ conn.commit()
+
+        # Fetch the studio id
+ cursor.execute("SELECT id FROM studios WHERE href = ?", (data["href"],))
+ stu_id = cursor.fetchone()[0]
+ if stu_id:
+ logging.debug(f"成功插入/更新发行商: {data['name']}")
+ return stu_id
+ else:
+ return None
+ except sqlite3.Error as e:
+ conn.rollback()
+ logging.error(f"数据库错误: {e}")
+ return None
+
+# """ 删除制作公司(按 id 或 name) """
+def delete_studio(identifier):
+ try:
+ if isinstance(identifier, int):
+ cursor.execute("DELETE FROM studios WHERE id = ?", (identifier,))
+ elif isinstance(identifier, str):
+ cursor.execute("DELETE FROM studios WHERE name = ?", (identifier,))
+ conn.commit()
+ logging.info(f"成功删除制作公司: {identifier}")
+ except sqlite3.Error as e:
+ conn.rollback()
+ logging.error(f"删除失败: {e}")
+
+# """ 查询制作公司(按 id 或 name) """
+def query_studio(identifier):
+ try:
+ if isinstance(identifier, int):
+ cursor.execute("SELECT * FROM studios WHERE id = ?", (identifier,))
+ else:
+ cursor.execute("SELECT * FROM studios WHERE name LIKE ?", (f"%{identifier}%",))
+
+ studio = cursor.fetchone()
+ if studio:
+ return dict(zip([desc[0] for desc in cursor.description], studio))
+ else:
+ logging.warning(f"未找到制作公司: {identifier}")
+ return None
+ except sqlite3.Error as e:
+ logging.error(f"查询失败: {e}")
+ return None
+
+# Query studio hrefs with optional filters
+def query_studio_hrefs(**filters):
+ try:
+ sql = "SELECT href FROM studios WHERE 1=1"
+ params = []
+
+ if "id" in filters:
+ sql += " AND id = ?"
+ params.append(filters["id"])
+ if "href" in filters:
+ sql += " AND href = ?"
+ params.append(filters["href"])
+ if "name" in filters:
+ sql += " AND name LIKE ?"
+ params.append(f"%{filters['name']}%")
+
+ cursor.execute(sql, params)
+ return [row[0] for row in cursor.fetchall()]
+
+ except sqlite3.Error as e:
+ logging.error(f"查询 href 失败: {e}")
+ return None
+
+# """从指定表中通过 href 查找 id"""
+def get_id_by_href(table: str, href: str) -> int:
+ cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
+ row = cursor.fetchone()
+ return row[0] if row else None
+
+# """插入或更新电影数据"""
+def insert_or_update_movie(movie_data):
+ try:
+        # Resolve the related foreign-key ids
+ distributor_id = get_id_by_href('distributors', movie_data['DistributorHref'])
+ studio_id = get_id_by_href('studios', movie_data['StudioHref'])
+ director_id = get_id_by_href('performers', movie_data['DirectorHref'])
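+        # Hrefs not yet in the db resolve to None and are stored as NULL foreign keys.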
+
+        # Upsert the movie row
+ cursor.execute(
+ """
+ INSERT INTO movies (title, minutes, distributor_id, studio_id, release_date, added_to_IAFD_date,
+ all_girl, all_male, compilation, webscene, director_id, href, updated_at)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
+ ON CONFLICT(href) DO UPDATE SET
+ title=excluded.title, minutes=excluded.minutes, distributor_id=excluded.distributor_id,
+ studio_id=excluded.studio_id, release_date=excluded.release_date,
+ added_to_IAFD_date=excluded.added_to_IAFD_date, all_girl=excluded.all_girl,
+ all_male=excluded.all_male, compilation=excluded.compilation, webscene=excluded.webscene,
+ director_id=excluded.director_id, updated_at = datetime('now', 'localtime')
+ """,
+ (movie_data['title'], movie_data['Minutes'], distributor_id, studio_id, movie_data['ReleaseDate'],
+ movie_data['AddedtoIAFDDate'], movie_data['All-Girl'], movie_data['All-Male'],
+ movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
+ )
+ conn.commit()
+ logging.info("Movie inserted/updated: %s", movie_data['title'])
+
+        # Fetch the movie_id just written
+ cursor.execute("SELECT id FROM movies WHERE href = ?", (movie_data['href'],))
+ movie_id = cursor.fetchone()[0]
+
+        # Link performers through the performers_movies junction table
+ for performer in movie_data.get('Performers', []):
+ performer_id = get_id_by_href('performers', performer['href'])
+ if performer_id:
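+                # Flatten the scraped tags into one pipe-separated notes column.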
+ notes = '|'.join(performer['tags'])
+ cursor.execute(
+ """
+ INSERT INTO performers_movies (performer_id, movie_id, role, notes)
+ VALUES (?, ?, ?, ?)
+ ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes
+ """,
+ (performer_id, movie_id, "Actor", notes)
+ )
+ logging.debug(f"Performers {performer['href']} linked to movie: %s", movie_data['title'])
+ else:
+                logging.warning(f"missing performer, url {performer['href']}, in movie: ({movie_data['title']}) {movie_data['href']}")
+
+        # Record compilation relationships in the movies_appers_in table
+ for appears in movie_data.get("AppearsIn", []):
+            # get_id_by_href already returns the scalar id
+            appears_in_id = get_id_by_href('movies', appears['href'])
+            if appears_in_id:
+ cursor.execute("""
+ INSERT INTO movies_appers_in (movie_id, appears_in_id, gradation, notes)
+ VALUES (?, ?, ?, ?)
+ ON CONFLICT(movie_id, appears_in_id) DO NOTHING
+ """, (movie_id, appears_in_id, 1, appears["title"]))
+ else:
+                logging.warning(f"missing AppearsIn movie in movies table, parent_url {appears['href']}, in movie: ({movie_data['title']}) {movie_data['href']}")
+
+ conn.commit()
+ return movie_id
+
+ except Exception as e:
+ conn.rollback()
+ logging.error("Error inserting movie: %s", e)
+ return None
+
+# 删除电影数据"""
+def delete_movie(identifier):
+ try:
+ if isinstance(identifier, int):
+ cursor.execute("DELETE FROM movies WHERE id = ?", (identifier,))
+ elif isinstance(identifier, str):
+ cursor.execute("DELETE FROM movies WHERE href = ?", (identifier,))
+ else:
+ logging.warning("无效的删除参数")
+ return
+ conn.commit()
+ logging.info(f"Deleted movie with {identifier}")
+
+ except sqlite3.Error as e:
+ conn.rollback()
+ logging.error("Error deleting movie: %s", e)
+
+# 查找电影数据"""
+def query_movies(identifier):
+ try:
+ if isinstance(identifier, int):
+ cursor.execute("SELECT * FROM movies WHERE id = ?", (identifier,))
+ elif "http" in identifier:
+ cursor.execute("SELECT * FROM movies WHERE href = ?", (identifier,))
+ else:
+ cursor.execute("SELECT * FROM movies WHERE title LIKE ?", (f"%{identifier}%",))
+
+        movie = cursor.fetchone()
+        if movie:
+            # Build the result dict before the cursor is reused for the join query
+            result = dict(zip([desc[0] for desc in cursor.description], movie))
+            cursor.execute("SELECT performer_id FROM performers_movies WHERE movie_id = ?", (movie[0],))
+            result["performers"] = [row[0] for row in cursor.fetchall()]
+            return result
+ else:
+ logging.warning(f"find no data: {identifier}")
+ return None
+
+ except sqlite3.Error as e:
+ logging.error(f"查询失败: {e}")
+ return None
+
+# Query movie hrefs with optional filters
+def query_movie_hrefs(**filters):
+ try:
+ sql = "SELECT href FROM movies WHERE 1=1"
+ params = []
+
+ if "id" in filters:
+ sql += " AND id = ?"
+ params.append(filters["id"])
+ if "href" in filters:
+ sql += " AND href = ?"
+ params.append(filters["href"])
+ if "title" in filters:
+ sql += " AND title LIKE ?"
+ params.append(f"%{filters['title']}%")
+
+ cursor.execute(sql, params)
+ return [row[0] for row in cursor.fetchall()]
+
+ except sqlite3.Error as e:
+ logging.error(f"查询 href 失败: {e}")
+ return []
+
+
+if __name__ == "__main__":
+
+ try:
+ with open('../result/detail.json', 'r') as file:
+ performers = json.load(file)
+ for performer in performers:
+ insert_or_update_performer(performer)
+
+ print(query_performer("Kirsten"))
+ #delete_performer("https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34")
+ print(query_performer_hrefs())
+ except FileNotFoundError:
+ logging.info("detail.json not found, starting fresh.")
\ No newline at end of file
diff --git a/scripts/iafd/src/utils.py b/scripts/iafd/src/utils.py
new file mode 100644
index 0000000..7b3cf82
--- /dev/null
+++ b/scripts/iafd/src/utils.py
@@ -0,0 +1,92 @@
+import re
+import os
+import json
+import logging
+
+# Parse the height and weight strings into numbers
+def parse_height(height_str):
+    try:
+        return int(height_str.split("(")[-1].replace(" cm)", ""))
+    except (AttributeError, ValueError):
+        return None
+
+def parse_weight(weight_str):
+    try:
+        return int(weight_str.split(" ")[0])
+    except (AttributeError, ValueError):
+        return None
+
+update_dir = '../result'
+performers_dir = f'{update_dir}/performers'
+movies_dir = f'{update_dir}/movies'
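+# Results are bucketed into one-character subdirectories (see create_sub_directory below).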
+
+def uniq_performers(new_performers):
+ try:
+ if not isinstance(new_performers, list):
+ raise TypeError(f"new_performers should be a list, but got {type(new_performers)}")
+
+ seen = set()
+ unique_performers = []
+
+ for item in new_performers:
+            if not item or item.get('href') is None:
+ raise ValueError(f"Invalid item in new_performers: {item}")
+
+ if item["href"] not in seen:
+ seen.add(item["href"])
+ unique_performers.append(item)
+
+ return unique_performers
+
+ except Exception as e:
+ logging.error(f"Error in remove_duplicate_performers: {e}")
+ return [] # 返回空列表,避免程序崩溃
+
+# Create (if needed) a one-character bucket directory under base_dir
+def create_sub_directory(base_dir, name):
+    # Use the first character of the name, lowercased, as the bucket
+    sub_dir = name[:1].lower()
+    full_path = os.path.join(base_dir, sub_dir)
+    os.makedirs(full_path, exist_ok=True)
+    return full_path
+
+# Extract the id value from an href like https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
+def extract_id_from_href(href):
+ """从href中提取id参数"""
+ match = re.search(r'id=([a-f0-9\-]+)', href)
+ return match.group(1) if match else ''
+
+# Write each performer to its own JSON file
+def write_person_json(person, href, data):
+    # Resolve the bucket directory
+ person_dir = create_sub_directory(performers_dir, person)
+ person_id = extract_id_from_href(href)
+ person_filename = f"{person.replace(' ', '-')}({person_id}).json" # 用 - 替换空格
+ full_path = os.path.join(person_dir, person_filename)
+
+ try:
+ with open(full_path, 'w', encoding='utf-8') as json_file:
+ json.dump(data, json_file, indent=4, ensure_ascii=False)
+ except Exception as e:
+ logging.error(f"Error writing file {full_path}: {e}")
+
+
+# Write each movie to its own JSON file
+def write_movie_json(href, data):
+    # Resolve the bucket directory from the movie id
+    movie_id = extract_id_from_href(href)
+    movie_dir = create_sub_directory(movies_dir, movie_id)
+    movie_filename = f"{movie_id}.json"
+    full_path = os.path.join(movie_dir, movie_filename)
+
+ try:
+ with open(full_path, 'w', encoding='utf-8') as json_file:
+ json.dump(data, json_file, indent=4, ensure_ascii=False)
+ except Exception as e:
+ logging.error(f"Error writing file {full_path}: {e}")
+