modify some scripts.
@@ -12,6 +12,7 @@ scripts/iafd/data/tmp/
 scripts/iafd/result/tmp/
 scripts/iafd/result/bak/
 scripts/iafd/result/performers/
+scripts/iafd/result/movies/
 scripts/iafd/log/
 scripts/thelordofporn/log/
 scripts/vixen_group/log/
@@ -0,0 +1,20 @@
{
  "href": "https://www.iafd.com/title.rme/id=0b332f5e-c0b8-4536-a79c-5abd4da9c68a",
  "title": "Barebackin' Men",
  "Minutes": "No Data",
  "Distributor": "1 Distribution",
  "Studio": "1 Distribution",
  "ReleaseDate": "No Data",
  "AddedtoIAFDDate": "Jan 1, 2006",
  "All-Girl": "No",
  "All-Male": "Yes",
  "Compilation": "No",
  "Webscene": "",
  "Director": "No Data",
  "DirectorHref": "",
  "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=10/1%2ddistribution.htm",
  "StudioHref": "https://www.iafd.com/studio.rme/studio=10/1%2ddistribution.htm",
  "Performers": [],
  "SceneBreakdowns": [],
  "AppearsIn": []
}
@@ -0,0 +1,56 @@
{
  "href": "https://www.iafd.com/title.rme/id=2f582dcf-192e-4adf-9d60-447df8f16b9c",
  "title": "Slim Goodies POV 2",
  "Minutes": "84",
  "Distributor": "Exotic Vixen Films",
  "Studio": "Exotic Vixen Films",
  "ReleaseDate": "No Data",
  "AddedtoIAFDDate": "Jan 17, 2024",
  "All-Girl": "No",
  "All-Male": "No",
  "Compilation": "No",
  "Webscene": "",
  "Director": "Just Mike Starks",
  "DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
  "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
  "StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
  "Performers": [
    {
      "name": "Amica Mea",
      "href": "https://www.iafd.com/person.rme/id=933ab6e6-ba98-49a7-a9e2-c1a4523ab89c",
      "tags": [
        "Amica Mea"
      ]
    },
    {
      "name": "Baby Breezy",
      "href": "https://www.iafd.com/person.rme/id=b0d3673c-20b4-41eb-bb15-622b2f8e5ed3",
      "tags": [
        "Baby Breezy"
      ]
    },
    {
      "name": "Blu Mere",
      "href": "https://www.iafd.com/person.rme/id=8d41dd04-f188-44c3-ac40-dc64711cf905",
      "tags": [
        "Blu Mere"
      ]
    },
    {
      "name": "Just Mike Starks",
      "href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
      "tags": [
        "Just Mike Starks"
      ]
    },
    {
      "name": "Mocha Menage",
      "href": "https://www.iafd.com/person.rme/id=63b084e9-a144-41da-acf9-a913b68416cd",
      "tags": [
        "Mocha Menage"
      ]
    }
  ],
  "SceneBreakdowns": [],
  "AppearsIn": []
}
@@ -0,0 +1,70 @@
{
  "href": "https://www.iafd.com/title.rme/id=9af4e9f4-68ce-47ec-a7d7-fde92862af57",
  "title": "Atlanta U: College Freaks",
  "Minutes": "No Data",
  "Distributor": "Exotic Vixen Films",
  "Studio": "Exotic Vixen Films",
  "ReleaseDate": "No Data",
  "AddedtoIAFDDate": "Sep 19, 2020",
  "All-Girl": "No",
  "All-Male": "No",
  "Compilation": "No",
  "Webscene": "",
  "Director": "Just Mike Starks",
  "DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
  "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
  "StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
  "Performers": [
    {
      "name": "Aaliyah Ali",
      "href": "https://www.iafd.com/person.rme/id=7904b6a0-965c-4137-92e2-2ca0cdc4ff38",
      "tags": [
        "Aaliyah Ali"
      ]
    },
    {
      "name": "Bones Montana",
      "href": "https://www.iafd.com/person.rme/id=8c18f9ad-045d-475f-bee8-cab6bed1fad4",
      "tags": [
        "Bones Montana"
      ]
    },
    {
      "name": "Cameron Cox",
      "href": "https://www.iafd.com/person.rme/id=ce96cf3d-e85e-4d46-ad5e-f05881c88a26",
      "tags": [
        "Cameron Cox"
      ]
    },
    {
      "name": "Crystal Cooper",
      "href": "https://www.iafd.com/person.rme/id=fc02ac73-2892-4578-9bf4-eed87afc4980",
      "tags": [
        "Crystal Cooper"
      ]
    },
    {
      "name": "Jazmine Adore",
      "href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
      "tags": [
        "Jazmine Adore"
      ]
    },
    {
      "name": "Just Mike Starks",
      "href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
      "tags": [
        "Just Mike Starks"
      ]
    },
    {
      "name": "Lala Ivey",
      "href": "https://www.iafd.com/person.rme/id=7ae838ee-764f-4725-b422-b41ffac1e67b",
      "tags": [
        "Lala Ivey"
      ]
    }
  ],
  "SceneBreakdowns": [],
  "AppearsIn": []
}
@@ -0,0 +1,85 @@
{
  "href": "https://www.iafd.com/title.rme/id=ca753243-8e3a-49ac-88aa-357055187e8c",
  "title": "Slim Goodies POV",
  "Minutes": "61",
  "Distributor": "Exotic Vixen Films",
  "Studio": "Exotic Vixen Films",
  "ReleaseDate": "No Data",
  "AddedtoIAFDDate": "Sep 19, 2020",
  "All-Girl": "No",
  "All-Male": "No",
  "Compilation": "No",
  "Webscene": "",
  "Director": "Just Mike Starks",
  "DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
  "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
  "StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
  "Performers": [
    {
      "name": "Gina Ferrero",
      "href": "https://www.iafd.com/person.rme/id=54d62a5b-2ce5-40fa-b050-14325a62f4fd",
      "tags": [
        "Gina Ferrero"
      ]
    },
    {
      "name": "Imani Reign",
      "href": "https://www.iafd.com/person.rme/id=8566216f-bcbc-4764-ba50-73405cf79dce",
      "tags": [
        "Imani Reign"
      ]
    },
    {
      "name": "Jazmine Adore",
      "href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
      "tags": [
        "Jazmine Adore"
      ]
    },
    {
      "name": "Just Mike Starks",
      "href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
      "tags": [
        "Just Mike Starks"
      ]
    },
    {
      "name": "Niomie King",
      "href": "https://www.iafd.com/person.rme/id=dbefea32-7f88-41a9-9de3-e467015d687a",
      "tags": [
        "Niomie King"
      ]
    }
  ],
  "SceneBreakdowns": [
    {
      "scene": "Scene 1",
      "performers": [
        "Imani Reign",
        "Just Mike Starks"
      ]
    },
    {
      "scene": "Scene 2",
      "performers": [
        "Jazmine Adore",
        "Just Mike Starks"
      ]
    },
    {
      "scene": "Scene 3",
      "performers": [
        "Gina Ferrero",
        "Just Mike Starks"
      ]
    },
    {
      "scene": "Scene 4",
      "performers": [
        "Niomie King",
        "Just Mike Starks"
      ]
    }
  ],
  "AppearsIn": []
}
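These four result files share one flat schema, so they can be read back uniformly. A minimal sketch for loading them (the scripts/iafd/result/movies/ directory comes from the .gitignore hunk above; the *.json glob and running from scripts/iafd/src/ are assumptions, since the diff does not show how the files are named):

import glob
import json

# Print (title, performer count, scene count) for every saved movie file.
for path in glob.glob('../result/movies/*.json'):
    with open(path, 'r') as f:
        movie = json.load(f)
    print(movie['title'], len(movie['Performers']), len(movie['SceneBreakdowns']))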
26  scripts/iafd/src/config.py  Normal file
@@ -0,0 +1,26 @@
import logging
import os
import inspect
from datetime import datetime

global_share_data_dir = '/root/sharedata'
global_host_data_dir = '/root/hostdir/scripts_data'

# Configure logging.
def setup_logging(log_filename=None):
    # If no log_filename is given, derive one from the calling script's name.
    if log_filename is None:
        # Get the filename of the script that called setup_logging.
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]

        # Current date, formatted as yyyymmdd.
        current_date = datetime.now().strftime('%Y%m%d')
        # Build the log filename, inserting the date before the extension.
        log_filename = f'../log/{caller_filename}_{current_date}.log'

    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
                        handlers=[
                            logging.FileHandler(log_filename),
                            logging.StreamHandler()
                        ])
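A minimal usage sketch for setup_logging (assuming the script runs from scripts/iafd/src/, so the relative ../log/ path resolves to the scripts/iafd/log/ directory ignored above):

import logging
import config

# With no argument, the log file name is derived from the calling script:
# ../log/<script_name>_<yyyymmdd>.log
config.setup_logging()
logging.info("logging initialized")

# An explicit file name can also be passed:
# config.setup_logging(log_filename='../log/custom.log')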
320  scripts/iafd/src/fetch.py  Normal file
@@ -0,0 +1,320 @@

import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as utils
import iafd_scraper as scraper
import utils as func

config.setup_logging()

debug = True

# Fetch the performer list by astrological sign (no pagination).
def fetch_performers_by_astro(existed_performer_hrefs):
    performers = []

    for astro in scraper.astro_list:
        url = scraper.astr_base_url + astro
        logging.info(f"Fetching data for {astro}, url {url} ...")

        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_astro(soup, astro)
            if list_data:
                for row in list_data:
                    if row['href'] not in existed_performer_hrefs:
                        performers.append({
                            'person': row['person'],
                            'href': row['href']
                        })
            else:
                logging.warning(f'fetch astro error. {url} ...')
        else:
            logging.warning(f'fetch astro error. {url} ...')

        # Break early when debugging.
        if debug:
            break
    return performers


# Fetch the performer list by birthday (no pagination).
def fetch_performers_by_birth(existed_performer_hrefs):
    performers = []

    for month in range(1, 13):  # Months 1 through 12.
        for day in range(1, 32):  # Days 1 through 31.
            url = scraper.birth_base_url.format(month=month, day=day)
            logging.info(f"Fetching data for birth, url {url}")
            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_page_birth(soup, month, day)
                if list_data:
                    for row in list_data:
                        if row['href'] not in existed_performer_hrefs:
                            performers.append({
                                'person': row['person'],
                                'href': row['href']
                            })
                else:
                    logging.warning(f'fetch birth error. {url} ...')
            else:
                logging.warning(f'fetch birth error. {url} ...')

            # Return early when debugging.
            if debug:
                return performers

    return performers


# Normalize ethnicity names that contain spaces.
def format_ethnic(ethnic):
    return ethnic.replace(' ', '+')

# Fetch the performer list by ethnicity (paginated).
def fetch_performers_by_ethnic(existed_performer_hrefs):
    performers = []

    for ethnic in scraper.ethnic_list:
        url = scraper.ethnic_url + format_ethnic(ethnic)
        next_url = url

        while next_url:
            logging.info(f"Fetching data for {ethnic}, url {next_url} ...")
            soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
                                      parser="lxml", preprocessor=scraper.preprocess_html)
            if soup:
                list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
                if list_data:
                    for row in list_data:
                        if row['href'] not in existed_performer_hrefs:
                            performers.append({
                                'person': row['person'],
                                'href': row['href']
                            })
                else:
                    logging.warning(f'fetch ethnic error. {url} ...')
            else:
                logging.warning(f'fetch ethnic error. {url} ...')
                break

            # Return early when debugging.
            if debug:
                return performers

    return performers


# Fetch the distributors list.
def fetch_distributors_list(existed_distributors_href):
    url = scraper.distributors_list_url
    distributors_list = []

    logging.info(f"Fetching data for distributors list, url {url} ...")
    soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
    if soup:
        list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
        if list_data:
            for row in list_data:
                dis_url = scraper.distributors_base_url + row['href']
                if dis_url in existed_distributors_href:
                    continue
                distributors_list.append({
                    'name': row['name'],
                    'href': dis_url
                })
        else:
            logging.warning(f'fetch distributors error. {url} ...')
    else:
        logging.warning(f'fetch distributors error. {url} ...')
    return distributors_list

# Fetch the studios list.
def fetch_studios_list(existed_studios_href):
    url = scraper.studios_list_url
    studios_list = []

    logging.info(f"Fetching data for studios list, url {url} ...")
    soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
    if soup:
        list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
        if list_data:
            for row in list_data:
                stu_url = scraper.studios_base_url + row['href']
                if stu_url in existed_studios_href:
                    continue
                studios_list.append({
                    'name': row['name'],
                    'href': stu_url
                })
        else:
            logging.warning(f'fetch studios error. {url} ...')
    else:
        logging.warning(f'fetch studios error. {url} ...')
    return studios_list

# Check for updates.
def check_update():
    # Read the existing performer list from the database.
    existed_performer_hrefs = utils.query_performer_hrefs()
    if not existed_performer_hrefs:
        logging.warning('get existed performers from db error.')
        return None

    # Collect new performers from the list pages.
    new_performers = []
    #new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
    #new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
    new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))

    # Fetch each performer's details and write them to the database.
    new_performers = list({item["href"]: item for item in new_performers}.values())  # De-duplicate by href.
    logging.info(f'get new performers count: {len(new_performers)} ')
    for performer in new_performers:
        url = performer['href']
        person = performer['person']
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
        if soup:
            data, credits = scraper.parse_page_performer(soup)
            if data:
                performer_id = utils.insert_or_update_performer({
                    'href': url,
                    'person': person,
                    **data
                })
                if performer_id:
                    logging.info(f'insert one person, id: {performer_id}, person: {person}, url: {url}')
                else:
                    logging.warning(f'insert person: {person} {url} failed.')

                # Write to a local JSON file.
                func.write_person_json(person, url, {
                    'href': url,
                    'person': person,
                    **data,
                    'credits': credits if credits else {}
                })
            else:
                logging.warning(f'parse_page_performer error. person: {person}, url: {url}')
        else:
            logging.warning(f'fetch_page error. person: {person}, url: {url}')
        # Break early when debugging.
        if debug:
            break

    # Read the distributors list from the database.
    existed_distributors_href = utils.query_distributor_hrefs()
    if existed_distributors_href is None:
        logging.warning('get existed distributors from db error.')
        return
    new_distributors = fetch_distributors_list(existed_distributors_href)
    for dist in new_distributors:
        dist_id = utils.insert_or_update_distributor(dist)
        if dist_id:
            logging.info(f'insert one distributor record, id: {dist_id}, name: {dist["name"]}, href: {dist["href"]}')
        else:
            logging.warning(f'insert into distributor failed. name: {dist["name"]} href: {dist["href"]}')

    # Read the studios list from the database.
    existed_studios_href = utils.query_studio_hrefs()
    if existed_studios_href is None:
        logging.warning('get existed studios from db error.')
        return
    new_studios = fetch_studios_list(existed_studios_href)
    for stu in new_studios:
        stu_id = utils.insert_or_update_studio(stu)
        if stu_id:
            logging.info(f'insert one studio record, id: {stu_id}, name: {stu["name"]}, href: {stu["href"]}')
        else:
            logging.warning(f'insert into studio failed. name: {stu["name"]}, href: {stu["href"]}')

    # Read the movie list from the database.
    existed_movies = utils.query_movie_hrefs()
    if existed_movies is None:
        logging.warning('load movies from db error')
        return
    new_movies = []
    new_movie_hrefs = []

    # Walk all distributors and collect their movie lists.
    existed_distributors_href = utils.query_distributor_hrefs(name='vixen')
    if existed_distributors_href is None:
        logging.warning('get existed distributors from db error.')
        return
    for url in existed_distributors_href:
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
            if list_data:
                for movie in list_data:
                    if movie['href'] in existed_movies:
                        continue
                    new_movies.append({
                        'title': movie['title'],
                        'href': movie['href']
                    })
                    new_movie_hrefs.append(movie['href'])
            else:
                logging.warning(f'parse_page_movie error. url: {url}')
        # Break early when debugging.
        if debug:
            break
    logging.info(f'all new movies found for distributors, now total new {len(new_movies)}')

    # Walk all studios and collect their movie lists.
    existed_studios_href = utils.query_studio_hrefs(name='vixen')
    if existed_studios_href is None:
        logging.warning('get existed studios from db error.')
        return
    for url in existed_studios_href:
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
            if list_data:
                for movie in list_data:
                    if movie['href'] in existed_movies or movie['href'] in new_movie_hrefs:
                        continue
                    new_movies.append({
                        'title': movie['title'],
                        'href': movie['href']
                    })
                    new_movie_hrefs.append(movie['href'])
            else:
                logging.warning(f'parse_page_movie error. url: {url}')
        # Break early when debugging.
        if debug:
            break
    logging.info(f'all new movies found for studios, now total new {len(new_movies)}')

    # Fetch the details for each new movie.
    new_movies = list({item["href"]: item for item in new_movies}.values())  # De-duplicate by href.
    logging.info(f'get merged new movies, count: {len(new_movies)} ')
    for movie in new_movies:
        url = movie['href']
        title = movie['title']
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
        if soup:
            movie_data = scraper.parse_page_movie(soup, url, title)
            if movie_data:
                movie_id = utils.insert_or_update_movie(movie_data)
                if movie_id:
                    logging.info(f'insert one movie, id: {movie_id}, title: {title} url: {url}')
                else:
                    logging.warning(f'insert movie {url} failed.')

                # Write to a local JSON file.
                func.write_movie_json(url, movie_data)
            else:
                logging.warning(f'parse_page_movie error. url: {url}')
        else:
            logging.warning(f'fetch_page error. url: {url}')
        # Break early when debugging.
        if debug:
            break

    logging.info('all processing completed!')

if __name__ == "__main__":
    check_update()
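check_update() de-duplicates both the performer and movie lists with the same dict-comprehension idiom: keying by href lets later duplicates overwrite earlier ones, and values() restores a list. A standalone illustration with made-up records:

records = [
    {'href': 'https://example.com/a', 'person': 'A'},
    {'href': 'https://example.com/b', 'person': 'B'},
    {'href': 'https://example.com/a', 'person': 'A (dup)'},
]
# The last occurrence of each href wins; insertion order is otherwise preserved.
deduped = list({item["href"]: item for item in records}.values())
print(deduped)  # [{'href': '.../a', 'person': 'A (dup)'}, {'href': '.../b', 'person': 'B'}]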
513  scripts/iafd/src/iafd_scraper.py  Normal file
@@ -0,0 +1,513 @@

import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config

# Base URLs and variable parameters.
host_url = "https://www.iafd.com"

astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']

birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"

ethnic_url = f"{host_url}/lookupethnic.rme/ethnic="
ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american', 'middle eastern', 'mediteranean', 'indian', 'polynesian', 'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african', 'south asian']

distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="

studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="

# Request headers and scraper instance.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()

# Fetch a page with CloudScraper and validate it; supports alternate parsers and an optional HTML preprocessor.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            response = scraper.get(url, headers=headers)
            response.raise_for_status()  # Raise on HTTP errors.

            # Preprocess the HTML if a preprocessor was supplied.
            html_text = preprocessor(response.text) if preprocessor else response.text

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the custom page check.
                return soup

            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None  # Still failing after the maximum number of retries.


# Repair the HTML structure: drop stray <br> tags and patch <a> tags; needed when fetching the ethnicity pages.
def preprocess_html(html):
    return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')

# Generic HTML structure validator.
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False

# Check that the movie info table exists.
def movie_validator(soup, table_id):
    return soup.find("table", id=table_id) is not None

# Parse the astrology page HTML and extract the data we need.
def parse_page_astro(soup, astro):
    astro_div = soup.find("div", id="astro")
    if not astro_div:
        logging.warning(f"Warning: No 'astro' div found in {astro}")
        return None, None

    flag = False
    list_cnt = 0
    list_data = []
    next_url = None

    birth_date = None
    for elem in astro_div.find_all(recursive=False):
        if elem.name == "h3" and "astroday" in elem.get("class", []):
            birth_date = elem.get_text(strip=True)
        elif elem.name == "div" and "perficon" in elem.get("class", []):
            a_tag = elem.find("a")
            if a_tag:
                href = host_url + a_tag["href"]
                name = a_tag.find("span", class_="perfname")
                if name:
                    list_data.append({
                        "astrology": astro,
                        "birth_date": birth_date,
                        "person": name.get_text(strip=True),
                        "href": href
                    })
                    flag = True
                    list_cnt += 1
    if flag:
        logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
        return list_data, next_url
    else:
        return None, None


# Parse a birthday page's contents.
def parse_page_birth(soup, month, day):
    datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
    if not datarows:
        return None, None

    flag = False
    list_cnt = 0
    list_data = []
    next_url = None
    rows = datarows[0].find_all('div', class_='col-sm-4')
    for row in rows:
        link_tag = row.find('a')
        person = link_tag.text.strip() if link_tag else ''
        href = link_tag['href'] if link_tag else ''
        href = host_url + href

        # Skip hrefs that have already been collected.
        flag = True
        if any(entry['href'] == href for entry in list_data):
            continue

        # Append the record.
        list_data.append({
            'month': month,
            'day': day,
            'person': person,
            'href': href
        })
        list_cnt += 1

    if flag:
        logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
        return list_data, next_url
    else:
        return None, None


# Parse the ethnicity page HTML and extract the data we need.
def parse_page_ethnic(soup, ethnic):
    rows = soup.find_all('div', class_='row headshotrow')
    flag = False
    list_data = []
    next_url = None

    for row in rows:
        for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
            link_tag = col.find('a')
            img_tag = col.find('div', class_='pictag')
            flag = True

            if link_tag and img_tag:
                href = host_url + link_tag['href']
                person = img_tag.text.strip()

                # Append the record.
                list_data.append({
                    'ethnic': ethnic,
                    'person': person,
                    'href': href
                })
    if flag:
        logging.debug(f"get {len(list_data)} persons from this page.")

        next_page = soup.find('a', rel='next')
        if next_page:
            next_url = host_url + next_page['href']
            logging.debug(f"Found next page: {next_url}")
            return list_data, next_url
        else:
            logging.debug(f"All pages fetched for {ethnic}.")
            return list_data, None
    else:
        return None, None


# Parse the distributor/studio list page.
def parse_page_dist_stu_list(soup, select_name):
    list_data = []
    next_url = None

    select_element = soup.find('select', {'name': select_name})
    if select_element:
        options = select_element.find_all('option')
        for option in options:
            value = option.get('value')  # The value attribute.
            text = option.text.strip()  # The option's text content.
            list_data.append({
                'name': text,
                'href': str(value)
            })
        return list_data, next_url
    else:
        return None, None


# Parse a distributor/studio page and extract the movie rows.
def parse_page_dist_stu(soup, table_id):
    table = soup.find("table", id=table_id)
    if not table:
        logging.warning(f"Warning: No {table_id} table found ")
        return None, None

    # Find the thead and drop it.
    thead = table.find('thead')
    if thead:
        thead.decompose()  # The thead does not need parsing.

    # Only the tbody remains now.
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []

    list_data = []
    next_url = None
    for row in rows:
        cols = row.find_all('td')
        if len(cols) >= 5:
            title = cols[0].text.strip()
            label = cols[1].text.strip()
            year = cols[2].text.strip()
            rev = cols[3].text.strip()
            a_href = cols[0].find('a')
            href = host_url + a_href['href'] if a_href else ''

            list_data.append({
                'title': title,
                'label': label,
                'year': year,
                'rev': rev,
                'href': href
            })
    return list_data, next_url


# Parse a credits table; there is one for personal appearances and one for directing credits.
def parse_credits_table(table, distributor_list):
    # Find the thead and drop it.
    thead = table.find('thead')
    if thead:
        thead.decompose()  # The thead does not need parsing.

    # Only the tbody remains now.
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []

    movies = []
    distributor_count = {key: 0 for key in distributor_list}  # Initialize a counter per distributor.

    # rows = table.find_all('tr', class_='we')
    for row in rows:
        cols = row.find_all('td')
        if len(cols) >= 6:
            title = cols[0].text.strip()
            year = cols[1].text.strip()
            distributor = cols[2].text.strip().lower()
            notes = cols[3].text.strip()
            rev = cols[4].text.strip()
            formats = cols[5].text.strip()

            for key in distributor_list:
                if key in distributor:
                    distributor_count[key] += 1

            movies.append({
                'title': title,
                'year': year,
                'distributor': distributor,
                'notes': notes,
                'rev': rev,
                'formats': formats
            })
    return movies, distributor_count


# Parse a performer page and extract the data we need.
def parse_page_performer(soup):
    # Extracted metadata goes here.
    data = {}

    # Map our field names to the labels used in the HTML.
    fields = {
        'performer_aka': 'Performer AKA',
        'birthday': 'Birthday',
        'astrology': 'Astrology',
        'birthplace': 'Birthplace',
        'gender': 'Gender',
        'years_active': 'Years Active',
        'ethnicity': 'Ethnicity',
        'nationality': 'Nationality',
        'hair_colors': 'Hair Colors',
        'eye_color': 'Eye Color',
        'height': 'Height',
        'weight': 'Weight',
        'measurements': 'Measurements',
        'tattoos': 'Tattoos',
        'piercings': 'Piercings'
    }
    reversed_map = {v: k for k, v in fields.items()}

    # Parse the credits tables: one for appearances, one for directing.
    role_list = ['personal', 'directoral']
    distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
    credits_list = {}

    # Track the per-distributor counts in a dict.
    distributor_count = {key: 0 for key in distributor_list}  # Initialize a counter per distributor.
    for role in role_list:
        table = soup.find('table', id=role)
        if table:
            movies, stat_map = parse_credits_table(table, distributor_list)
            credits_list[role] = movies
            # Update the distributor counts.
            for distributor in distributor_list:
                distributor_count[distributor] += stat_map.get(distributor, 0)

    # Count the movies.
    #movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
    movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))

    # Nothing was found.
    if len(credits_list) == 0:
        logging.warning("movie table empty.")

    # Walk every bioheading and collect the metadata.
    bioheadings = soup.find_all('p', class_='bioheading')
    for bio in bioheadings:
        heading = bio.text.strip()
        biodata = None

        # Headings containing "Performer" need special handling.
        if 'Performer' in heading:
            heading = 'Performer AKA'
            biodata_div = bio.find_next('div', class_='biodata')
            if biodata_div:
                div_text = biodata_div.get_text(separator='|').strip()
                biodata = [b.strip() for b in div_text.split('|') if b.strip()]
        else:
            biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''

        # Store the value.
        if heading in reversed_map:
            kkey = reversed_map[heading]
            data[kkey] = biodata

    # Add the statistics to data.
    data['movies_cnt'] = movies_cnt
    data['vixen_cnt'] = distributor_count['vixen']
    data['blacked_cnt'] = distributor_count['blacked']
    data['tushy_cnt'] = distributor_count['tushy']
    data['x_art_cnt'] = distributor_count['x-art']

    return data, credits_list


# Parse a movie page's HTML and extract the movie details.
def parse_page_movie(soup, href, title):
    # Parse the basic movie info.
    movie_data = {}
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            val = value.text.strip()
            if key in ["Distributor", "Studio", "Director"]:
                link = value.find("a")
                if link:
                    val = link.text.strip()
                    movie_data[f'{key}Href'] = host_url + link['href']
            movie_data[key] = val
    else:
        return None

    # Parse the cast info.
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]

        performer["tags"] = [
            tag.strip() for br in cast.find_all("br")
            if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
        ]

        #performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
        performers.append(performer)

    # Parse the scene breakdowns.
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")

        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()  # Scene number.
                performer_info = cols[1]  # Performers plus link info.

                # Keep the HTML before the first <br> (preserving <i> tags and other markup),
                # handling both the <br/> and <br> forms.
                performer_html = str(performer_info)  # The cell's full HTML.
                performers_html = performer_html.split("<br/>")[0].split("<br>")[0].strip()

                # Reduce to plain text (strip the HTML tags, keep only the text content).
                performers_soup = BeautifulSoup(performers_html, "html.parser")
                performers_text = performers_soup.get_text()

                # Extract the performers.
                scene_performers = [p.strip() for p in performers_text.split(",")]

                # Try to extract `webscene` and `studio`.
                links_data = {}
                links = performer_info.find_all("a")
                if links:
                    webscene_title = links[0].text.strip() if len(links) > 0 else None
                    webscene = links[0]["href"] if len(links) > 0 else None
                    studio = links[1].text.strip() if len(links) > 1 else None
                    studio_lnk = links[1]["href"] if len(links) > 1 else None
                    links_data = {
                        "title": webscene_title,
                        "webscene": webscene,
                        "studio": studio,
                        "studio_lnk": studio_lnk,
                    }

                scene_data = {
                    "scene": scene,
                    "performers": scene_performers,
                    **links_data,
                }
                scene_breakdowns.append(scene_data)

    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})


    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": movie_data.get("DirectorHref", ""),
        "DistributorHref": movie_data.get("DistributorHref", ""),
        "StudioHref": movie_data.get("StudioHref", ""),
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }


if __name__ == "__main__":

    for astro in astro_list:
        url = astr_base_url + astro
        next_url = url
        logging.info(f"Fetching data for {astro}, url {url} ...")

        while True:
            soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
            if soup:
                list_data, next_url = parse_page_astro(soup, astro)
                if list_data:
                    print(list_data[0] if len(list_data) > 0 else 'no data')
                break
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # Wait before retrying.

        time.sleep(2)  # Throttle the request rate.
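fetch_page pairs naturally with functools.partial: each caller binds the validator arguments up front and passes the resulting one-argument callable, which is how both the __main__ block above and fetch.py use it. A minimal sketch of the same pattern (live network access and the current page structure are assumed):

from functools import partial
import iafd_scraper as scraper

# Bind the validator: the astrology page counts as valid once <div id="astro"> exists.
validator = partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id")

soup = scraper.fetch_page(scraper.astr_base_url + "Aries", validator)
if soup:
    people, next_url = scraper.parse_page_astro(soup, "Aries")
    print(len(people or []))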
459  scripts/iafd/src/sqlite_utils.py  Normal file
@ -0,0 +1,459 @@
|
|||||||
|
import sqlite3
|
||||||
|
import json
|
||||||
|
import config
|
||||||
|
import utils
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# 连接 SQLite 数据库
|
||||||
|
DB_PATH = f"{config.global_share_data_dir}/shared.db" # 替换为你的数据库文件
|
||||||
|
conn = sqlite3.connect(DB_PATH)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
# 获取当前时间
|
||||||
|
def get_current_time():
|
||||||
|
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
# 插入演员信息
|
||||||
|
def insert_or_update_performer(data):
|
||||||
|
try:
|
||||||
|
cursor.execute("""
|
||||||
|
INSERT INTO performers (href, name, gender, birthday, astrology, birthplace, years_active, ethnicity, nationality, hair_colors,
|
||||||
|
eye_color, height_str, weight_str, measurements, tattoos, piercings, weight, height, movies_cnt, vixen_cnt,
|
||||||
|
blacked_cnt, tushy_cnt, x_art_cnt, updated_at)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
|
||||||
|
ON CONFLICT(href) DO UPDATE SET
|
||||||
|
name = excluded.name,
|
||||||
|
gender = excluded.gender,
|
||||||
|
birthday = excluded.birthday,
|
||||||
|
astrology = excluded.astrology,
|
||||||
|
birthplace = excluded.birthplace,
|
||||||
|
years_active = excluded.years_active,
|
||||||
|
ethnicity = excluded.ethnicity,
|
||||||
|
nationality = excluded.nationality,
|
||||||
|
hair_colors = excluded.hair_colors,
|
||||||
|
eye_color = excluded.eye_color,
|
||||||
|
height_str = excluded.height_str,
|
||||||
|
weight_str = excluded.weight_str,
|
||||||
|
measurements = excluded.measurements,
|
||||||
|
tattoos = excluded.tattoos,
|
||||||
|
piercings = excluded.piercings,
|
||||||
|
weight = excluded.weight,
|
||||||
|
height = excluded.height,
|
||||||
|
movies_cnt = excluded.movies_cnt,
|
||||||
|
vixen_cnt = excluded.vixen_cnt,
|
||||||
|
blacked_cnt = excluded.blacked_cnt,
|
||||||
|
tushy_cnt = excluded.tushy_cnt,
|
||||||
|
x_art_cnt = excluded.x_art_cnt,
|
||||||
|
updated_at = datetime('now', 'localtime')
|
||||||
|
""", (
|
||||||
|
data["href"], data["person"], data.get("gender"), data.get("birthday"), data.get("astrology"), data.get("birthplace"), data.get("years_active"),
|
||||||
|
data.get("ethnicity"), data.get("nationality"), data.get("hair_colors"), data.get("eye_color"), data.get("height"),
|
||||||
|
data.get("weight"), data.get("measurements"), data.get("tattoos"), data.get("piercings"), utils.parse_weight(data.get('weight')), utils.parse_height(data.get('height')),
|
||||||
|
data.get("movies_cnt", 0), data.get("vixen_cnt", 0), data.get("blacked_cnt", 0), data.get("tushy_cnt", 0), data.get("x_art_cnt", 0)
|
||||||
|
))
|
||||||
|
|
||||||
|
# 获取 performer_id
|
||||||
|
cursor.execute("SELECT id FROM performers WHERE href = ?", (data["href"],))
|
||||||
|
performer_id = cursor.fetchone()[0]
|
||||||
|
|
||||||
|
# 删除旧的 alias
|
||||||
|
cursor.execute("DELETE FROM performer_aliases WHERE performer_id = ?", (performer_id,))
|
||||||
|
|
||||||
|
# 插入新的 alias
|
||||||
|
for alias in data.get("performer_aka", []):
|
||||||
|
if alias.lower() != "no known aliases":
|
||||||
|
cursor.execute("INSERT INTO performer_aliases (performer_id, alias) VALUES (?, ?)", (performer_id, alias))
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
logging.debug(f"成功插入/更新演员: {data['person']}")
|
||||||
|
return performer_id
|
||||||
|
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
conn.rollback()
|
||||||
|
logging.error(f"数据库错误: {e}")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
conn.rollback()
|
||||||
|
logging.error(f"未知错误: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# 按 id 或 href 删除演员
|
||||||
|
def delete_performer(identifier):
|
||||||
|
try:
|
||||||
|
if isinstance(identifier, int):
|
||||||
|
cursor.execute("DELETE FROM performers WHERE id = ?", (identifier,))
|
||||||
|
elif isinstance(identifier, str):
|
||||||
|
cursor.execute("DELETE FROM performers WHERE href = ?", (identifier,))
|
||||||
|
else:
|
||||||
|
logging.warning("无效的删除参数")
|
||||||
|
return
|
||||||
|
conn.commit()
|
||||||
|
logging.info(f"成功删除演员: {identifier}")
|
||||||
|
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
conn.rollback()
|
||||||
|
logging.error(f"删除失败: {e}")
|
||||||
|
|
||||||
|
# 按 id、href 或 name 查询演员信息
|
||||||
|
def query_performer(identifier):
|
||||||
|
try:
|
||||||
|
if isinstance(identifier, int):
|
||||||
|
cursor.execute("SELECT * FROM performers WHERE id = ?", (identifier,))
|
||||||
|
elif "http" in identifier:
|
||||||
|
cursor.execute("SELECT * FROM performers WHERE href = ?", (identifier,))
|
||||||
|
else:
|
||||||
|
cursor.execute("SELECT * FROM performers WHERE name LIKE ?", (f"%{identifier}%",))
|
||||||
|
|
||||||
|
performer = cursor.fetchone()
|
||||||
|
if performer:
|
||||||
|
cursor.execute("SELECT alias FROM performer_aliases WHERE performer_id = ?", (performer[0],))
|
||||||
|
aliases = [row[0] for row in cursor.fetchall()]
|
||||||
|
result = dict(zip([desc[0] for desc in cursor.description], performer))
|
||||||
|
result["performer_aka"] = aliases
|
||||||
|
return result
|
||||||
|
else:
|
||||||
|
logging.warning(f"未找到演员: {identifier}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
logging.error(f"查询失败: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# 按条件查询 href 列表
|
||||||
|
def query_performer_hrefs(**filters):
|
||||||
|
try:
|
||||||
|
sql = "SELECT href FROM performers WHERE 1=1"
|
||||||
|
params = []
|
||||||
|
|
||||||
|
if "id" in filters:
|
||||||
|
sql += " AND id = ?"
|
||||||
|
params.append(filters["id"])
|
||||||
|
if "href" in filters:
|
||||||
|
sql += " AND href = ?"
|
||||||
|
params.append(filters["href"])
|
||||||
|
if "name" in filters:
|
||||||
|
sql += " AND name LIKE ?"
|
||||||
|
params.append(f"%{filters['name']}%")
|
||||||
|
|
||||||
|
cursor.execute(sql, params)
|
||||||
|
return [row[0] for row in cursor.fetchall()]
|
||||||
|
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
logging.error(f"查询 href 失败: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# 插入或更新发行商 """
|
||||||
|
def insert_or_update_distributor(data):
|
||||||
|
try:
|
||||||
|
cursor.execute("""
|
||||||
|
INSERT INTO distributors (name, href, updated_at)
|
||||||
|
VALUES (?, ? , datetime('now', 'localtime'))
|
||||||
|
ON CONFLICT(href) DO UPDATE SET
|
||||||
|
name = excluded.name,
|
||||||
|
updated_at = datetime('now', 'localtime')
|
||||||
|
""", (data["name"], data["href"]))
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
# 获取 performer_id
|
||||||
|
cursor.execute("SELECT id FROM distributors WHERE href = ?", (data["href"],))
|
||||||
|
dist_id = cursor.fetchone()[0]
|
||||||
|
if dist_id:
|
||||||
|
logging.debug(f"成功插入/更新发行商: {data['name']}")
|
||||||
|
return dist_id
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
conn.rollback()
|
||||||
|
logging.error(f"数据库错误: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# 删除发行商(按 id 或 name) """
|
||||||
|
def delete_distributor(identifier):
|
||||||
|
try:
|
||||||
|
if isinstance(identifier, int):
|
||||||
|
cursor.execute("DELETE FROM distributors WHERE id = ?", (identifier,))
|
||||||
|
elif isinstance(identifier, str):
|
||||||
|
cursor.execute("DELETE FROM distributors WHERE name = ?", (identifier,))
|
||||||
|
conn.commit()
|
||||||
|
logging.info(f"成功删除发行商: {identifier}")
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
conn.rollback()
|
||||||
|
logging.error(f"删除失败: {e}")
|
||||||
|
|
||||||
|
# 查询发行商(按 id 或 name) """
|
||||||
|
def query_distributor(identifier):
|
||||||
|
try:
|
||||||
|
if isinstance(identifier, int):
|
||||||
|
cursor.execute("SELECT * FROM distributors WHERE id = ?", (identifier,))
|
||||||
|
else:
|
||||||
|
cursor.execute("SELECT * FROM distributors WHERE name LIKE ?", (f"%{identifier}%",))
|
||||||
|
|
||||||
|
distributor = cursor.fetchone()
|
||||||
|
if distributor:
|
||||||
|
return dict(zip([desc[0] for desc in cursor.description], distributor))
|
||||||
|
else:
|
||||||
|
logging.warning(f"未找到发行商: {identifier}")
|
||||||
|
return None
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
logging.error(f"查询失败: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# 按条件查询 href 列表
|
||||||
|
def query_distributor_hrefs(**filters):
|
||||||
|
try:
|
||||||
|
sql = "SELECT href FROM distributors WHERE 1=1"
|
||||||
|
params = []
|
||||||
|
|
||||||
|
if "id" in filters:
|
||||||
|
sql += " AND id = ?"
|
||||||
|
params.append(filters["id"])
|
||||||
|
if "url" in filters:
|
||||||
|
sql += " AND href = ?"
|
||||||
|
params.append(filters["href"])
|
||||||
|
if "name" in filters:
|
||||||
|
sql += " AND name LIKE ?"
|
||||||
|
params.append(f"%{filters['name']}%")
|
||||||
|
|
||||||
|
cursor.execute(sql, params)
|
||||||
|
return [row[0] for row in cursor.fetchall()]
|
||||||
|
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
logging.error(f"查询 href 失败: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# """ 插入或更新制作公司 """
|
||||||
|
def insert_or_update_studio(data):
|
||||||
|
try:
|
||||||
|
cursor.execute("""
|
||||||
|
INSERT INTO studios (name, href, updated_at)
|
||||||
|
VALUES (?, ?, datetime('now', 'localtime'))
|
||||||
|
ON CONFLICT(href) DO UPDATE SET
|
||||||
|
name = excluded.name,
|
||||||
|
updated_at = datetime('now', 'localtime')
|
||||||
|
""", (data["name"], data["href"]))
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
# 获取 performer_id
|
||||||
|
cursor.execute("SELECT id FROM studios WHERE href = ?", (data["href"],))
|
||||||
|
stu_id = cursor.fetchone()[0]
|
||||||
|
if stu_id:
|
||||||
|
logging.debug(f"成功插入/更新发行商: {data['name']}")
|
||||||
|
return stu_id
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
conn.rollback()
|
||||||
|
logging.error(f"数据库错误: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# """ 删除制作公司(按 id 或 name) """
|
||||||
|
def delete_studio(identifier):
|
||||||
|
try:
|
||||||
|
if isinstance(identifier, int):
|
||||||
|
cursor.execute("DELETE FROM studios WHERE id = ?", (identifier,))
|
||||||
|
elif isinstance(identifier, str):
|
||||||
|
cursor.execute("DELETE FROM studios WHERE name = ?", (identifier,))
|
||||||
|
conn.commit()
|
||||||
|
logging.info(f"成功删除制作公司: {identifier}")
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
conn.rollback()
|
||||||
|
logging.error(f"删除失败: {e}")
|
||||||
|
|
||||||
|
# """ 查询制作公司(按 id 或 name) """
|
||||||
|
def query_studio(identifier):
|
||||||
|
try:
|
||||||
|
if isinstance(identifier, int):
|
||||||
|
cursor.execute("SELECT * FROM studios WHERE id = ?", (identifier,))
|
||||||
|
else:
|
||||||
|
cursor.execute("SELECT * FROM studios WHERE name LIKE ?", (f"%{identifier}%",))
|
||||||
|
|
||||||
|
studio = cursor.fetchone()
|
||||||
|
if studio:
|
||||||
|
return dict(zip([desc[0] for desc in cursor.description], studio))
|
||||||
|
else:
|
||||||
|
logging.warning(f"未找到制作公司: {identifier}")
|
||||||
|
return None
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
logging.error(f"查询失败: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# 按条件查询 href 列表
|
||||||
|
def query_studio_hrefs(**filters):
|
||||||
|
try:
|
||||||
|
sql = "SELECT href FROM studios WHERE 1=1"
|
||||||
|
params = []
|
||||||
|
|
||||||
|
if "id" in filters:
|
||||||
|
sql += " AND id = ?"
|
||||||
|
params.append(filters["id"])
|
||||||
|
if "href" in filters:
|
||||||
|
sql += " AND href = ?"
|
||||||
|
params.append(filters["href"])
|
||||||
|
if "name" in filters:
|
||||||
|
sql += " AND name LIKE ?"
|
||||||
|
params.append(f"%{filters['name']}%")
|
||||||
|
|
||||||
|
cursor.execute(sql, params)
|
||||||
|
return [row[0] for row in cursor.fetchall()]
|
||||||
|
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
logging.error(f"查询 href 失败: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# """从指定表中通过 href 查找 id"""
|
||||||
|
def get_id_by_href(table: str, href: str) -> int:
|
||||||
|
cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
|
||||||
|
row = cursor.fetchone()
|
||||||
|
return row[0] if row else None
|
||||||
|
|
||||||
|
# """插入或更新电影数据"""
|
||||||
|
def insert_or_update_movie(movie_data):
|
||||||
|
try:
|
||||||
|
# 获取相关 ID
|
||||||
|
distributor_id = get_id_by_href('distributors', movie_data['DistributorHref'])
|
||||||
|
studio_id = get_id_by_href('studios', movie_data['StudioHref'])
|
||||||
|
director_id = get_id_by_href('performers', movie_data['DirectorHref'])
|
||||||
|
|
||||||
|
# 插入或更新电影信息
|
||||||
|
cursor.execute(
|
||||||
|
"""
|
||||||
|
INSERT INTO movies (title, minutes, distributor_id, studio_id, release_date, added_to_IAFD_date,
|
||||||
|
all_girl, all_male, compilation, webscene, director_id, href, updated_at)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
|
||||||
|
ON CONFLICT(href) DO UPDATE SET
|
||||||
|
title=excluded.title, minutes=excluded.minutes, distributor_id=excluded.distributor_id,
|
||||||
|
studio_id=excluded.studio_id, release_date=excluded.release_date,
|
||||||
|
added_to_IAFD_date=excluded.added_to_IAFD_date, all_girl=excluded.all_girl,
|
||||||
|
all_male=excluded.all_male, compilation=excluded.compilation, webscene=excluded.webscene,
|
||||||
|
director_id=excluded.director_id, updated_at = datetime('now', 'localtime')
|
||||||
|
""",
|
||||||
|
(movie_data['title'], movie_data['Minutes'], distributor_id, studio_id, movie_data['ReleaseDate'],
|
||||||
|
movie_data['AddedtoIAFDDate'], movie_data['All-Girl'], movie_data['All-Male'],
|
||||||
|
movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
logging.info("Movie inserted/updated: %s", movie_data['title'])
|
||||||
|
|
||||||
|
# 获取插入的 movie_id
|
||||||
|
cursor.execute("SELECT id FROM movies WHERE href = ?", (movie_data['href'],))
|
||||||
|
movie_id = cursor.fetchone()[0]
|
||||||
|
|
||||||
|
# 插入 performers_movies 关系表
|
||||||
|
for performer in movie_data.get('Performers', []):
|
||||||
|
performer_id = get_id_by_href('performers', performer['href'])
|
||||||
|
if performer_id:
|
||||||
|
notes = '|'.join(performer['tags'])
|
||||||
|
cursor.execute(
|
||||||
|
"""
|
||||||
|
INSERT INTO performers_movies (performer_id, movie_id, role, notes)
|
||||||
|
VALUES (?, ?, ?, ?)
|
||||||
|
ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes
|
||||||
|
""",
|
||||||
|
(performer_id, movie_id, "Actor", notes)
|
||||||
|
)
|
||||||
|
logging.debug(f"Performers {performer['href']} linked to movie: %s", movie_data['title'])
|
||||||
|
else:
|
||||||
|
logging.warning(f'missing performer, url {performer['href']}, in movie: ({movie_data['title']}) {movie_data['href']}')
|
||||||
|
|
||||||
|
# 插入 movies_appers_in 表
|
||||||
|
for appears in movie_data.get("AppearsIn", []):
|
||||||
|
appears_in_id = get_id_by_href('movies', appears['href'])
|
||||||
|
if appears_in_id:
|
||||||
|
appears_in_id = appears_in_id[0]
|
||||||
|
cursor.execute("""
|
||||||
|
INSERT INTO movies_appers_in (movie_id, appears_in_id, gradation, notes)
|
||||||
|
VALUES (?, ?, ?, ?)
|
||||||
|
ON CONFLICT(movie_id, appears_in_id) DO NOTHING
|
||||||
|
""", (movie_id, appears_in_id, 1, appears["title"]))
|
||||||
|
else:
|
||||||
|
logging.warning(f'missing AppearsIn movie in movies table, parent_url {appears['href']}, in movie: ({movie_data['title']}) {movie_data['href']}')
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
return movie_id
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
conn.rollback()
|
||||||
|
logging.error("Error inserting movie: %s", e)
|
||||||
|
return None
|
||||||
|
|
||||||
|
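# Usage sketch (field names follow the scraped movie JSON; values are illustrative):
#     movie_id = insert_or_update_movie({
#         'href': 'https://www.iafd.com/title.rme/id=...', 'title': 'Example Title',
#         'Minutes': '84', 'DistributorHref': '...', 'StudioHref': '...',
#         'DirectorHref': '', 'ReleaseDate': 'No Data', 'AddedtoIAFDDate': 'Jan 1, 2006',
#         'All-Girl': 'No', 'All-Male': 'No', 'Compilation': 'No', 'Webscene': '',
#         'Performers': [], 'AppearsIn': [],
#     })
# The ON CONFLICT(href) upsert assumes a UNIQUE constraint on movies.href.
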
# Delete movie data.
def delete_movie(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("DELETE FROM movies WHERE id = ?", (identifier,))
        elif isinstance(identifier, str):
            cursor.execute("DELETE FROM movies WHERE href = ?", (identifier,))
        else:
            logging.warning("Invalid delete parameter")
            return
        conn.commit()
        logging.info(f"Deleted movie with {identifier}")

    except sqlite3.Error as e:
        conn.rollback()
        logging.error("Error deleting movie: %s", e)

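# Usage sketch: an int deletes by primary key, a string deletes by href.
#     delete_movie(42)
#     delete_movie("https://www.iafd.com/title.rme/id=...")
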
# Query movie data.
def query_movies(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("SELECT * FROM movies WHERE id = ?", (identifier,))
        elif "http" in identifier:
            cursor.execute("SELECT * FROM movies WHERE href = ?", (identifier,))
        else:
            cursor.execute("SELECT * FROM movies WHERE title LIKE ?", (f"%{identifier}%",))

        movie = cursor.fetchone()
        if movie:
            # Map column names to values before the next execute() replaces cursor.description
            result = dict(zip([desc[0] for desc in cursor.description], movie))
            cursor.execute("SELECT performer_id FROM performers_movies WHERE movie_id = ?", (movie[0],))
            result["performers"] = [row[0] for row in cursor.fetchall()]
            return result
        else:
            logging.warning(f"no data found: {identifier}")
            return None

    except sqlite3.Error as e:
        logging.error(f"Query failed: {e}")
        return None

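# Usage sketch: accepts an id, an href, or a title fragment.
#     query_movies(1)
#     query_movies("https://www.iafd.com/title.rme/id=...")
#     query_movies("Slim Goodies")  # LIKE match on title
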
# Query a list of hrefs by filter conditions.
def query_movie_hrefs(**filters):
    try:
        sql = "SELECT href FROM movies WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "href" in filters:
            sql += " AND href = ?"
            params.append(filters["href"])
        if "title" in filters:
            sql += " AND title LIKE ?"
            params.append(f"%{filters['title']}%")

        cursor.execute(sql, params)
        return [row[0] for row in cursor.fetchall()]

    except sqlite3.Error as e:
        logging.error(f"Failed to query hrefs: {e}")
        return []

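# Usage sketch: no filters returns every movie href; keyword filters AND together.
#     all_hrefs = query_movie_hrefs()
#     pov_hrefs = query_movie_hrefs(title="POV")
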
if __name__ == "__main__":

    try:
        with open('../result/detail.json', 'r') as file:
            performers = json.load(file)
            for performer in performers:
                insert_or_update_performer(performer)

        print(query_performer("Kirsten"))
        #delete_performer("https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34")
        print(query_performer_hrefs())
    except FileNotFoundError:
        logging.info("detail.json not found, starting fresh.")

92  scripts/iafd/src/utils.py  Normal file
@ -0,0 +1,92 @@
import re
import os
import json
import time
import csv
import logging


# Parse height and weight strings into numbers.
def parse_height(height_str):
    try:
        return int(height_str.split("(")[-1].replace(" cm)", ""))
    except (AttributeError, ValueError, IndexError):
        return None


def parse_weight(weight_str):
    try:
        return int(weight_str.split(" ")[0])
    except (AttributeError, ValueError, IndexError):
        return None

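# Worked examples, assuming the IAFD profile formats "5 feet, 7 inches (170 cm)"
# and "130 lbs (59 kg)" (input strings are illustrative):
#     parse_height("5 feet, 7 inches (170 cm)")  # -> 170
#     parse_weight("130 lbs (59 kg)")            # -> 130
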
update_dir = '../result'
performers_dir = f'{update_dir}/performers'
movies_dir = f'{update_dir}/movies'

def uniq_performers(new_performers):
    try:
        if not isinstance(new_performers, list):
            raise TypeError(f"new_performers should be a list, but got {type(new_performers)}")

        seen = set()
        unique_performers = []

        for item in new_performers:
            if not item or item['href'] is None:
                raise ValueError(f"Invalid item in new_performers: {item}")

            if item["href"] not in seen:
                seen.add(item["href"])
                unique_performers.append(item)

        return unique_performers

    except Exception as e:
        logging.error(f"Error in uniq_performers: {e}")
        return []  # Return an empty list instead of crashing the caller

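# Usage sketch: deduplicates by href while keeping first-seen order.
#     uniq_performers([{"href": "a"}, {"href": "a"}, {"href": "b"}])
#     # -> [{"href": "a"}, {"href": "b"}]
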
# Create a sub-directory bucketed by the first character of the name.
def create_sub_directory(base_dir, name):
    # Take the first character of the name, lower-cased
    sub_dir = name[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    return full_path

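# Usage sketch:
#     create_sub_directory('../result/performers', 'Kirsten')
#     # ensures and returns '../result/performers/k'
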
# Extract the id value from an href such as
# https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from an href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''

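# Usage sketch: returns '' when no id parameter is present.
#     extract_id_from_href("https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586")
#     # -> '21898a3c-1ddd-4793-8d93-375d6db20586'
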
# Write each performer to its own JSON file.
def write_person_json(person, href, data):
    # Build the target path
    person_dir = create_sub_directory(performers_dir, person)
    person_id = extract_id_from_href(href)
    person_filename = f"{person.replace(' ', '-')}({person_id}).json"  # Replace spaces with '-'
    full_path = os.path.join(person_dir, person_filename)

    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")

# Write each movie to its own JSON file.
def write_movie_json(href, data):
    # Bucket by the first character of the movie id
    movie_id = extract_id_from_href(href)
    movie_dir = create_sub_directory(movies_dir, movie_id)
    movie_filename = f"{movie_id}.json"
    full_path = os.path.join(movie_dir, movie_filename)

    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")
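
# Usage sketch: a movie lands under movies_dir, bucketed by the first character of
# its id, e.g. '../result/movies/2/<id>.json' (id value hypothetical).
#     write_movie_json(movie['href'], movie)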