modify some scripts.
@@ -0,0 +1,20 @@
{
    "href": "https://www.iafd.com/title.rme/id=0b332f5e-c0b8-4536-a79c-5abd4da9c68a",
    "title": "Barebackin' Men",
    "Minutes": "No Data",
    "Distributor": "1 Distribution",
    "Studio": "1 Distribution",
    "ReleaseDate": "No Data",
    "AddedtoIAFDDate": "Jan 1, 2006",
    "All-Girl": "No",
    "All-Male": "Yes",
    "Compilation": "No",
    "Webscene": "",
    "Director": "No Data",
    "DirectorHref": "",
    "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=10/1%2ddistribution.htm",
    "StudioHref": "https://www.iafd.com/studio.rme/studio=10/1%2ddistribution.htm",
    "Performers": [],
    "SceneBreakdowns": [],
    "AppearsIn": []
}
@@ -0,0 +1,56 @@
|
||||
{
|
||||
"href": "https://www.iafd.com/title.rme/id=2f582dcf-192e-4adf-9d60-447df8f16b9c",
|
||||
"title": "Slim Goodies POV 2",
|
||||
"Minutes": "84",
|
||||
"Distributor": "Exotic Vixen Films",
|
||||
"Studio": "Exotic Vixen Films",
|
||||
"ReleaseDate": "No Data",
|
||||
"AddedtoIAFDDate": "Jan 17, 2024",
|
||||
"All-Girl": "No",
|
||||
"All-Male": "No",
|
||||
"Compilation": "No",
|
||||
"Webscene": "",
|
||||
"Director": "Just Mike Starks",
|
||||
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"Performers": [
|
||||
{
|
||||
"name": "Amica Mea",
|
||||
"href": "https://www.iafd.com/person.rme/id=933ab6e6-ba98-49a7-a9e2-c1a4523ab89c",
|
||||
"tags": [
|
||||
"Amica Mea"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Baby Breezy",
|
||||
"href": "https://www.iafd.com/person.rme/id=b0d3673c-20b4-41eb-bb15-622b2f8e5ed3",
|
||||
"tags": [
|
||||
"Baby Breezy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Blu Mere",
|
||||
"href": "https://www.iafd.com/person.rme/id=8d41dd04-f188-44c3-ac40-dc64711cf905",
|
||||
"tags": [
|
||||
"Blu Mere"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Just Mike Starks",
|
||||
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"tags": [
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Mocha Menage",
|
||||
"href": "https://www.iafd.com/person.rme/id=63b084e9-a144-41da-acf9-a913b68416cd",
|
||||
"tags": [
|
||||
"Mocha Menage"
|
||||
]
|
||||
}
|
||||
],
|
||||
"SceneBreakdowns": [],
|
||||
"AppearsIn": []
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
{
|
||||
"href": "https://www.iafd.com/title.rme/id=9af4e9f4-68ce-47ec-a7d7-fde92862af57",
|
||||
"title": "Atlanta U: College Freaks",
|
||||
"Minutes": "No Data",
|
||||
"Distributor": "Exotic Vixen Films",
|
||||
"Studio": "Exotic Vixen Films",
|
||||
"ReleaseDate": "No Data",
|
||||
"AddedtoIAFDDate": "Sep 19, 2020",
|
||||
"All-Girl": "No",
|
||||
"All-Male": "No",
|
||||
"Compilation": "No",
|
||||
"Webscene": "",
|
||||
"Director": "Just Mike Starks",
|
||||
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"Performers": [
|
||||
{
|
||||
"name": "Aaliyah Ali",
|
||||
"href": "https://www.iafd.com/person.rme/id=7904b6a0-965c-4137-92e2-2ca0cdc4ff38",
|
||||
"tags": [
|
||||
"Aaliyah Ali"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Bones Montana",
|
||||
"href": "https://www.iafd.com/person.rme/id=8c18f9ad-045d-475f-bee8-cab6bed1fad4",
|
||||
"tags": [
|
||||
"Bones Montana"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Cameron Cox",
|
||||
"href": "https://www.iafd.com/person.rme/id=ce96cf3d-e85e-4d46-ad5e-f05881c88a26",
|
||||
"tags": [
|
||||
"Cameron Cox"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Crystal Cooper",
|
||||
"href": "https://www.iafd.com/person.rme/id=fc02ac73-2892-4578-9bf4-eed87afc4980",
|
||||
"tags": [
|
||||
"Crystal Cooper"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Jazmine Adore",
|
||||
"href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
|
||||
"tags": [
|
||||
"Jazmine Adore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Just Mike Starks",
|
||||
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"tags": [
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Lala Ivey",
|
||||
"href": "https://www.iafd.com/person.rme/id=7ae838ee-764f-4725-b422-b41ffac1e67b",
|
||||
"tags": [
|
||||
"Lala Ivey"
|
||||
]
|
||||
}
|
||||
],
|
||||
"SceneBreakdowns": [],
|
||||
"AppearsIn": []
|
||||
}
|
||||
@@ -0,0 +1,85 @@
|
||||
{
|
||||
"href": "https://www.iafd.com/title.rme/id=ca753243-8e3a-49ac-88aa-357055187e8c",
|
||||
"title": "Slim Goodies POV",
|
||||
"Minutes": "61",
|
||||
"Distributor": "Exotic Vixen Films",
|
||||
"Studio": "Exotic Vixen Films",
|
||||
"ReleaseDate": "No Data",
|
||||
"AddedtoIAFDDate": "Sep 19, 2020",
|
||||
"All-Girl": "No",
|
||||
"All-Male": "No",
|
||||
"Compilation": "No",
|
||||
"Webscene": "",
|
||||
"Director": "Just Mike Starks",
|
||||
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"Performers": [
|
||||
{
|
||||
"name": "Gina Ferrero",
|
||||
"href": "https://www.iafd.com/person.rme/id=54d62a5b-2ce5-40fa-b050-14325a62f4fd",
|
||||
"tags": [
|
||||
"Gina Ferrero"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Imani Reign",
|
||||
"href": "https://www.iafd.com/person.rme/id=8566216f-bcbc-4764-ba50-73405cf79dce",
|
||||
"tags": [
|
||||
"Imani Reign"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Jazmine Adore",
|
||||
"href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
|
||||
"tags": [
|
||||
"Jazmine Adore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Just Mike Starks",
|
||||
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"tags": [
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Niomie King",
|
||||
"href": "https://www.iafd.com/person.rme/id=dbefea32-7f88-41a9-9de3-e467015d687a",
|
||||
"tags": [
|
||||
"Niomie King"
|
||||
]
|
||||
}
|
||||
],
|
||||
"SceneBreakdowns": [
|
||||
{
|
||||
"scene": "Scene 1",
|
||||
"performers": [
|
||||
"Imani Reign",
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scene": "Scene 2",
|
||||
"performers": [
|
||||
"Jazmine Adore",
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scene": "Scene 3",
|
||||
"performers": [
|
||||
"Gina Ferrero",
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scene": "Scene 4",
|
||||
"performers": [
|
||||
"Niomie King",
|
||||
"Just Mike Starks"
|
||||
]
|
||||
}
|
||||
],
|
||||
"AppearsIn": []
|
||||
}
|
||||
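The result files above are plain JSON written with indent=4, so they are easy to reload for a quick spot-check. A minimal sketch (the path is hypothetical; any file under the result directory works):

import json

with open('../result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json', encoding='utf-8') as f:
    movie = json.load(f)
print(movie['title'], '-', len(movie['Performers']), 'performers,', len(movie['SceneBreakdowns']), 'scenes')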
26 scripts/iafd/src/config.py (Normal file)
@@ -0,0 +1,26 @@
import logging
import os
import inspect
from datetime import datetime

global_share_data_dir = '/root/sharedata'
global_host_data_dir = '/root/hostdir/scripts_data'

# Configure logging for the calling script.
def setup_logging(log_filename=None):
    # If no log_filename is passed, derive one from the name of the calling script.
    if log_filename is None:
        # Name of the script that called setup_logging
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]

        # Current date formatted as yyyymmdd
        current_date = datetime.now().strftime('%Y%m%d')
        # Build the log file name, inserting the date before the extension
        log_filename = f'../log/{caller_filename}_{current_date}.log'

    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
                        handlers=[
                            logging.FileHandler(log_filename),
                            logging.StreamHandler()
                        ])
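Note that logging.FileHandler raises FileNotFoundError when the ../log directory does not exist yet, so a caller may want to create it first. A minimal sketch (the relative directory layout is assumed, not part of the script above):

import os
import config

os.makedirs('../log', exist_ok=True)  # make sure the log directory is there
config.setup_logging()                # log file becomes ../log/<caller>_<yyyymmdd>.log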
320 scripts/iafd/src/fetch.py (Normal file)
@@ -0,0 +1,320 @@
|
||||
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
import argparse
|
||||
import logging
|
||||
from functools import partial
|
||||
import config
|
||||
import sqlite_utils as utils
|
||||
import iafd_scraper as scraper
|
||||
import utils as func
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
debug = True
|
||||
|
||||
# Fetch the performer list by astrological sign; no pagination.
|
||||
def fetch_performers_by_astro(existed_performer_hrefs):
|
||||
performers = []
|
||||
|
||||
for astro in scraper.astro_list:
|
||||
url = scraper.astr_base_url + astro
|
||||
logging.info(f"Fetching data for {astro}, url {url} ...")
|
||||
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_astro(soup, astro)
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
if row['href'] not in existed_performer_hrefs:
|
||||
performers.append({
|
||||
'person' : row['person'],
|
||||
'href' : row['href']
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
|
||||
# break out early when debugging
|
||||
if debug:
|
||||
break
|
||||
return performers
|
||||
|
||||
|
||||
# Fetch the performer list by birthday; no pagination.
|
||||
def fetch_performers_by_birth(existed_performer_hrefs):
|
||||
performers = []
|
||||
|
||||
for month in range(1, 13): # months 1 through 12
for day in range(1, 32): # days 1 through 31
|
||||
url = scraper.birth_base_url.format(month=month, day=day)
|
||||
logging.info(f"Fetching data for birth, url {url}")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_birth(soup, month, day)
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
if row['href'] not in existed_performer_hrefs:
|
||||
performers.append({
|
||||
'person' : row['person'],
|
||||
'href' : row['href']
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch birth list error. {url} ...')
|
||||
else:
|
||||
logging.warning(f'fetch birth list error. {url} ...')
|
||||
|
||||
# return early when debugging
|
||||
if debug:
|
||||
return performers
|
||||
|
||||
return performers
|
||||
|
||||
# Handle ethnicity names that contain spaces.
|
||||
def format_ethnic(ethnic):
|
||||
return ethnic.replace(' ', '+')
|
||||
|
||||
# Fetch the performer list by ethnicity; paginated.
|
||||
def fetch_performers_by_ethnic(existed_performer_hrefs):
|
||||
performers = []
|
||||
|
||||
for ethnic in scraper.ethnic_list:
|
||||
url = scraper.ethnic_url + format_ethnic(ethnic)
|
||||
next_url = url
|
||||
|
||||
while next_url:
logging.info(f"Fetching data for {ethnic}, url {next_url} ...")
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
parser="lxml", preprocessor=scraper.preprocess_html)
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
if row['href'] not in existed_performer_hrefs:
|
||||
performers.append({
|
||||
'person' : row['person'],
|
||||
'href' : row['href']
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch ethnic error. {url} ...')
|
||||
else:
|
||||
logging.warning(f'fetch ethnic error. {url} ...')
|
||||
|
||||
# return early when debugging
|
||||
if debug:
|
||||
return performers
|
||||
return performers
|
||||
|
||||
|
||||
# Fetch the list of distributors.
|
||||
def fetch_distributors_list(existed_distributors_href):
|
||||
url = scraper.distributors_list_url
|
||||
distributors_list = []
|
||||
|
||||
logging.info(f"Fetching data for distributors list, url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
dis_url = scraper.distributors_base_url + row['href']
|
||||
if dis_url in existed_distributors_href :
|
||||
continue
|
||||
distributors_list.append({
|
||||
'name' : row['name'],
|
||||
'href' : dis_url
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch distributors list error. {url} ...')
|
||||
else:
|
||||
logging.warning(f'fetch distributors list error. {url} ...')
|
||||
return distributors_list
|
||||
|
||||
# Fetch the list of studios.
|
||||
def fetch_studios_list(existed_studios_href):
|
||||
url = scraper.studios_list_url
|
||||
studios_list = []
|
||||
|
||||
logging.info(f"Fetching data for studios list, url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
stu_url = scraper.studios_base_url + row['href']
|
||||
if stu_url in existed_studios_href:
|
||||
continue
|
||||
studios_list.append({
|
||||
'name' : row['name'],
|
||||
'href' : stu_url
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch studios list error. {url} ...')
|
||||
else:
|
||||
logging.warning(f'fetch studios list error. {url} ...')
|
||||
return studios_list
|
||||
|
||||
# Check for updates.
|
||||
def check_update():
|
||||
# Load the existing performer list from the database.
|
||||
existed_performer_hrefs = utils.query_performer_hrefs()
|
||||
if not existed_performer_hrefs:
|
||||
logging.warning(f'get existed performers from db error.')
|
||||
return None
|
||||
|
||||
# Collect new performers from the listing pages.
|
||||
new_performers = []
|
||||
#new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
|
||||
#new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
|
||||
new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))
|
||||
|
||||
# Fetch each performer's details and write them to the database.
|
||||
new_performers = list({item["href"]: item for item in new_performers}.values())
|
||||
logging.info(f'get new performers count: {len(new_performers)} ')
|
||||
for performer in new_performers:
|
||||
url = performer['href']
|
||||
person = performer['person']
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
|
||||
if soup:
|
||||
data, credits = scraper.parse_page_performer(soup)
|
||||
if data:
|
||||
performer_id = utils.insert_or_update_performer({
|
||||
'href': url,
|
||||
'person': person,
|
||||
**data
|
||||
})
|
||||
if performer_id:
|
||||
logging.info(f'insert one person, id: {performer_id}, person: {person}, url: {url}')
|
||||
else:
|
||||
logging.warning(f'insert person: {person} {url} failed.')
|
||||
|
||||
# Also write a local JSON file.
|
||||
func.write_person_json(person, url, {
|
||||
'href': url,
|
||||
'person': person,
|
||||
**data,
|
||||
'credits': credits if credits else {}
|
||||
})
|
||||
else:
|
||||
logging.warning(f'parse_page_performer error. person: {person}, url: {url}')
|
||||
else:
|
||||
logging.warning(f'fetch_page error. person: {person}, url: {url}')
|
||||
# break out early when debugging
|
||||
if debug:
|
||||
break
|
||||
|
||||
# Load the existing distributor list from the database.
|
||||
existed_distributors_href = utils.query_distributor_hrefs()
|
||||
if existed_distributors_href is None:
|
||||
logging.warning(f'get existed distributors from db error.')
|
||||
return
|
||||
new_distributors = fetch_distributors_list(existed_distributors_href)
|
||||
for dist in new_distributors:
|
||||
dist_id = utils.insert_or_update_distributor(dist)
|
||||
if dist_id:
|
||||
logging.info(f"insert one distributor record, id: {dist_id}, name: {dist['name']}, href: {dist['href']}")
|
||||
else:
|
||||
logging.warning(f"insert distributor failed. name: {dist['name']} href: {dist['href']}")
|
||||
|
||||
# Load the existing studio list from the database.
|
||||
existed_studios_href = utils.query_studio_hrefs()
|
||||
if existed_studios_href is None:
|
||||
logging.warning(f'get existed studios from db error.')
|
||||
return
|
||||
new_studios = fetch_studios_list(existed_studios_href)
|
||||
for stu in new_studios:
|
||||
stu_id = utils.insert_or_update_studio(stu)
|
||||
if stu_id:
|
||||
logging.info(f"insert one studio record, id: {stu_id}, name: {stu['name']}, href: {stu['href']}")
|
||||
else:
|
||||
logging.warning(f"insert studio failed. name: {stu['name']}, href: {stu['href']}")
|
||||
|
||||
# Load the existing movie list from the database.
|
||||
existed_movies = utils.query_movie_hrefs()
|
||||
if existed_movies is None:
|
||||
logging.warning(f'load movies from db error')
|
||||
return
|
||||
new_movies = []
|
||||
new_movie_hrefs = []
|
||||
|
||||
# Walk all distributors and collect their movie lists.
|
||||
existed_distributors_href = utils.query_distributor_hrefs(name='vixen')
|
||||
if existed_distributors_href is None:
|
||||
logging.warning(f'get existed distributors from db error.')
|
||||
return
|
||||
for url in existed_distributors_href:
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
|
||||
if list_data:
|
||||
for movie in list_data:
|
||||
if movie['href'] in existed_movies:
|
||||
continue
|
||||
new_movies.append({
|
||||
'title' : movie['title'],
|
||||
'href' : movie['href']
|
||||
})
|
||||
new_movie_hrefs.append(movie['href'])
|
||||
else :
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
# break out early when debugging
|
||||
if debug:
|
||||
break
|
||||
logging.info(f'all new movies found for distributors, now total new {len(new_movies)}')
|
||||
|
||||
# Walk all studios and collect their movie lists.
|
||||
existed_studios_href = utils.query_studio_hrefs(name='vixen')
|
||||
if existed_studios_href is None:
|
||||
logging.warning(f'get existed studios from db error.')
|
||||
return
|
||||
for url in existed_studios_href:
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
|
||||
if list_data:
|
||||
for movie in list_data:
|
||||
if movie['href'] in existed_movies or movie['href'] in new_movie_hrefs:
|
||||
continue
|
||||
new_movies.append({
|
||||
'title' : movie['title'],
|
||||
'href' : movie['href']
|
||||
})
|
||||
new_movie_hrefs.append(movie['href'])
|
||||
else :
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
# break out early when debugging
|
||||
if debug:
|
||||
break
|
||||
logging.info(f'all new movies found for studios, now total new {len(new_movies)}')
|
||||
|
||||
# Fetch the details of each new movie.
|
||||
new_movies = list({item["href"]: item for item in new_movies}.values())
|
||||
logging.info(f'get merged new movies, count: {len(new_movies)} ')
|
||||
for movie in new_movies:
|
||||
url = movie['href']
|
||||
title = movie['title']
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
|
||||
if soup:
|
||||
movie_data = scraper.parse_page_movie(soup, url, title)
|
||||
if movie_data :
|
||||
movie_id = utils.insert_or_update_movie(movie_data)
|
||||
if movie_id:
|
||||
logging.info(f'insert one movie, id: {movie_id}, title: {title} url: {url}')
|
||||
else:
|
||||
logging.warning(f'insert movie {url} failed.')
|
||||
|
||||
# Also write a local JSON file.
|
||||
func.write_movie_json(url, movie_data)
|
||||
else:
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
else:
|
||||
logging.warning(f'fetch_page error. url: {url}')
|
||||
# break out early when debugging
|
||||
if debug:
|
||||
break
|
||||
|
||||
logging.info(f'all process completed!')
|
||||
if __name__ == "__main__":
|
||||
check_update()
|
||||
513 scripts/iafd/src/iafd_scraper.py (Normal file)
@@ -0,0 +1,513 @@
|
||||
|
||||
import cloudscraper
|
||||
import time
|
||||
import json
|
||||
import csv
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from requests.exceptions import RequestException
|
||||
from functools import partial
|
||||
import config
|
||||
|
||||
# Base URLs and list parameters.
|
||||
host_url = "https://www.iafd.com"
|
||||
|
||||
astr_base_url = f"{host_url}/astrology.rme/sign="
|
||||
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
|
||||
|
||||
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
|
||||
|
||||
ethnic_url = f"{host_url}/lookupethnic.rme/ethnic="
|
||||
ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american', 'middle eastern', 'mediteranean', 'indian', 'polynesian', 'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african', 'south asian']
|
||||
|
||||
distributors_list_url = f'{host_url}/distrib.asp'
|
||||
distributors_base_url = f"{host_url}/distrib.rme/distrib="
|
||||
|
||||
studios_list_url = f"{host_url}/studio.asp"
|
||||
studios_base_url = f"{host_url}/studio.rme/studio="
|
||||
|
||||
# Request headers and the shared cloudscraper session.
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
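A single cloudscraper session is created at module level and reused by every request below; cloudscraper wraps requests and solves Cloudflare's JavaScript challenge, and reusing one session keeps the clearance cookies between calls. A minimal sketch of a bare request with it (illustrative only; fetch_page below adds retries and validation):

soup = BeautifulSoup(scraper.get(host_url, headers=headers).text, "html.parser")
print(soup.title.get_text(strip=True) if soup.title else "no title")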
# Fetch a page with cloudscraper and validate it; supports different parsers and an optional preprocessor.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            response = scraper.get(url, headers=headers)
            response.raise_for_status()  # raise on HTTP errors

            # Preprocess the HTML if a preprocessor was supplied
            html_text = preprocessor(response.text) if preprocessor else response.text

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the caller-supplied page check
                return soup

            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None  # still failing after the maximum number of retries
|
||||
|
||||
# Fix up the HTML structure: drop stray tags and patch <a> tags; needed when fetching the ethnicity pages.
def preprocess_html(html):
    return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')

# Generic HTML structure validator.
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False

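fetch_page accepts any callable that receives the parsed soup and returns truthy when the page looks right; the fetchers bind generic_validator with functools.partial (already imported at the top of this module). A minimal usage sketch, mirroring how fetch.py calls it:

soup = fetch_page(astr_base_url + 'Aries',
                  partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
if soup is None:
    logging.warning("page did not validate after retries")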
# Check that the movie info table exists.
def movie_validator(soup, table_id):
    return soup.find("table", id=table_id) is not None
|
||||
|
||||
# Parse the astrology listing page and extract the performer data.
|
||||
def parse_page_astro(soup, astro):
|
||||
astro_div = soup.find("div", id="astro")
|
||||
if not astro_div:
|
||||
logging.warning(f"Warning: No 'astro' div found in {astro}")
|
||||
return None, None
|
||||
|
||||
flag = False
|
||||
list_cnt = 0
|
||||
list_data = []
|
||||
next_url = None
|
||||
|
||||
birth_date = None
|
||||
for elem in astro_div.find_all(recursive=False):
|
||||
if elem.name == "h3" and "astroday" in elem.get("class", []):
|
||||
birth_date = elem.get_text(strip=True)
|
||||
elif elem.name == "div" and "perficon" in elem.get("class", []):
|
||||
a_tag = elem.find("a")
|
||||
if a_tag:
|
||||
href = host_url + a_tag["href"]
|
||||
name = a_tag.find("span", class_="perfname")
|
||||
if name:
|
||||
list_data.append({
|
||||
"astrology": astro,
|
||||
"birth_date": birth_date,
|
||||
"person": name.get_text(strip=True),
|
||||
"href": href
|
||||
})
|
||||
flag = True
|
||||
list_cnt = list_cnt +1
|
||||
if flag:
|
||||
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
|
||||
return list_data, next_url
|
||||
else:
|
||||
return None, None
|
||||
|
||||
|
||||
# Parse the birthday listing page and collect the performer data.
|
||||
def parse_page_birth(soup, month, day):
|
||||
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
|
||||
if not datarows:
|
||||
return None, None
|
||||
|
||||
flag = False
|
||||
list_cnt = 0
|
||||
list_data = []
|
||||
next_url = None
|
||||
rows = datarows[0].find_all('div', class_='col-sm-4')
|
||||
for row in rows:
|
||||
link_tag = row.find('a')
|
||||
person = link_tag.text.strip() if link_tag else ''
|
||||
href = link_tag['href'] if link_tag else ''
|
||||
href = host_url + href
|
||||
|
||||
# Skip hrefs that have already been collected.
|
||||
flag = True
|
||||
if any(entry['href'] == href for entry in list_data):
|
||||
continue
|
||||
|
||||
# Add the entry to the result list.
|
||||
list_data.append({
|
||||
'month': month,
|
||||
'day': day,
|
||||
'person': person,
|
||||
'href': href
|
||||
})
|
||||
list_cnt = list_cnt +1
|
||||
|
||||
if flag:
|
||||
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
|
||||
return list_data, next_url
|
||||
else:
|
||||
return None, None
|
||||
|
||||
|
||||
# Parse the ethnicity listing page and extract the performer data.
|
||||
def parse_page_ethnic(soup, ethnic):
|
||||
rows = soup.find_all('div', class_='row headshotrow')
|
||||
flag = False
|
||||
list_data = []
|
||||
next_url = None
|
||||
|
||||
for row in rows:
|
||||
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
|
||||
link_tag = col.find('a')
|
||||
img_tag = col.find('div', class_='pictag')
|
||||
flag = True
|
||||
|
||||
if link_tag and img_tag:
|
||||
href = host_url + link_tag['href']
|
||||
person = img_tag.text.strip()
|
||||
|
||||
# Store the entry in the result list.
|
||||
list_data.append({
|
||||
'ethnic': ethnic,
|
||||
'person': person,
|
||||
'href': href
|
||||
})
|
||||
if flag:
|
||||
logging.debug(f"get {len(list_data)} persons from this page.")
|
||||
|
||||
next_page = soup.find('a', rel='next')
|
||||
if next_page:
|
||||
next_url = host_url + next_page['href']
|
||||
logging.debug(f"Found next page: {next_url}")
|
||||
return list_data, next_url
|
||||
else:
|
||||
logging.debug(f"All pages fetched for {ethnic}.")
|
||||
return list_data, None
|
||||
else:
|
||||
return None, None
|
||||
|
||||
# Parse the distributor/studio selection list page.
|
||||
def parse_page_dist_stu_list(soup, select_name):
|
||||
list_data = []
|
||||
next_url = None
|
||||
|
||||
select_element = soup.find('select', {'name': select_name})
|
||||
if select_element :
|
||||
options = select_element.find_all('option')
|
||||
for option in options:
|
||||
value = option.get('value')  # the value attribute
text = option.text.strip()  # the display text
|
||||
list_data.append({
|
||||
'name' : text,
|
||||
'href' : str(value)
|
||||
})
|
||||
return list_data, next_url
|
||||
else:
|
||||
return None, None
|
||||
|
||||
# Parse a distributor/studio title table and extract the movie rows.
|
||||
def parse_page_dist_stu(soup, table_id):
|
||||
table = soup.find("table", id=table_id)
|
||||
if not table:
|
||||
logging.warning(f"Warning: No {table_id} table found ")
|
||||
return None, None
|
||||
|
||||
# Find the thead and drop it
thead = table.find('thead')
if thead:
thead.decompose()  # the header row does not need to be parsed
|
||||
|
||||
# Only the tbody is left now
|
||||
tbody = table.find('tbody')
|
||||
rows = tbody.find_all('tr') if tbody else []
|
||||
|
||||
list_data = []
|
||||
next_url = None
|
||||
for row in rows:
|
||||
cols = row.find_all('td')
|
||||
if len(cols) >= 5:
|
||||
title = cols[0].text.strip()
|
||||
label = cols[1].text.strip()
|
||||
year = cols[2].text.strip()
|
||||
rev = cols[3].text.strip()
|
||||
a_href = cols[0].find('a')
|
||||
href = host_url + a_href['href'] if a_href else ''
|
||||
|
||||
list_data.append({
|
||||
'title': title,
|
||||
'label': label,
|
||||
'year': year,
|
||||
'rev': rev,
|
||||
'href': href
|
||||
})
|
||||
return list_data, next_url
|
||||
|
||||
|
||||
# Parse a credits table; there is one for personal appearances and one for directing credits.
|
||||
def parse_credits_table(table, distributor_list):
|
||||
# Find the thead and drop it
thead = table.find('thead')
if thead:
thead.decompose()  # the header row does not need to be parsed
|
||||
|
||||
# Only the tbody is left now
|
||||
tbody = table.find('tbody')
|
||||
rows = tbody.find_all('tr') if tbody else []
|
||||
|
||||
movies = []
|
||||
distributor_count = {key: 0 for key in distributor_list}  # initialise a counter for each distributor
|
||||
|
||||
# rows = table.find_all('tr', class_='we')
|
||||
for row in rows:
|
||||
cols = row.find_all('td')
|
||||
if len(cols) >= 6:
|
||||
title = cols[0].text.strip()
|
||||
year = cols[1].text.strip()
|
||||
distributor = cols[2].text.strip().lower()
|
||||
notes = cols[3].text.strip()
|
||||
rev = cols[4].text.strip()
|
||||
formats = cols[5].text.strip()
|
||||
|
||||
for key in distributor_list:
|
||||
if key in distributor:
|
||||
distributor_count[key] += 1
|
||||
|
||||
movies.append({
|
||||
'title': title,
|
||||
'year': year,
|
||||
'distributor': distributor,
|
||||
'notes': notes,
|
||||
'rev': rev,
|
||||
'formats': formats
|
||||
})
|
||||
return movies, distributor_count
|
||||
|
||||
|
||||
# Parse a performer page and extract the data we need.
|
||||
def parse_page_performer(soup):
|
||||
# Extracted fields
|
||||
data = {}
|
||||
|
||||
# Field names we want, mapped to their labels in the HTML.
|
||||
fields = {
|
||||
'performer_aka': 'Performer AKA',
|
||||
'birthday': 'Birthday',
|
||||
'astrology': 'Astrology',
|
||||
'birthplace': 'Birthplace',
|
||||
'gender': 'Gender',
|
||||
'years_active': 'Years Active',
|
||||
'ethnicity': 'Ethnicity',
|
||||
'nationality': 'Nationality',
|
||||
'hair_colors': 'Hair Colors',
|
||||
'eye_color': 'Eye Color',
|
||||
'height': 'Height',
|
||||
'weight': 'Weight',
|
||||
'measurements': 'Measurements',
|
||||
'tattoos': 'Tattoos',
|
||||
'piercings': 'Piercings'
|
||||
}
|
||||
reversed_map = {v: k for k, v in fields.items()}
|
||||
|
||||
# Parse the credit tables: performer credits and directorial credits.
|
||||
role_list = ['personal', 'directoral']
|
||||
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
|
||||
credits_list = {}
|
||||
|
||||
# Keep the per-distributor counts in a dict
distributor_count = {key: 0 for key in distributor_list}
|
||||
for role in role_list:
|
||||
table = soup.find('table', id=role)
|
||||
if table :
|
||||
movies, stat_map = parse_credits_table(table, distributor_list)
|
||||
credits_list[role] = movies
|
||||
# Update the distributor counters
|
||||
for distributor in distributor_list:
|
||||
distributor_count[distributor] += stat_map.get(distributor, 0)
|
||||
|
||||
# Count the movies
#movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))

# Nothing found at all
if len(credits_list) == 0 :
logging.warning("movie table empty.")
|
||||
|
||||
# Walk each bioheading and pull out the metadata.
|
||||
bioheadings = soup.find_all('p', class_='bioheading')
|
||||
for bio in bioheadings:
|
||||
heading = bio.text.strip()
|
||||
biodata = None
|
||||
|
||||
# Headings containing "Performer" need special handling.
|
||||
if 'Performer' in heading:
|
||||
heading = 'Performer AKA'
|
||||
biodata_div = bio.find_next('div', class_='biodata')
|
||||
if biodata_div:
|
||||
div_text = biodata_div.get_text(separator='|').strip()
|
||||
biodata = [b.strip() for b in div_text.split('|') if b.strip()]
|
||||
else:
|
||||
biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
|
||||
|
||||
# Save the value
|
||||
if heading in reversed_map:
|
||||
kkey = reversed_map[heading]
|
||||
data[kkey] = biodata
|
||||
|
||||
# Attach the counters to the result
|
||||
data['movies_cnt'] = movies_cnt
|
||||
data['vixen_cnt'] = distributor_count['vixen']
|
||||
data['blacked_cnt'] = distributor_count['blacked']
|
||||
data['tushy_cnt'] = distributor_count['tushy']
|
||||
data['x_art_cnt'] = distributor_count['x-art']
|
||||
|
||||
return data, credits_list
|
||||
|
||||
|
||||
|
||||
# Parse a movie page and extract the movie information.
def parse_page_movie(soup, href, title):
# Basic movie information
|
||||
movie_data = {}
|
||||
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
|
||||
if info_div:
|
||||
labels = info_div.find_all("p", class_="bioheading")
|
||||
values = info_div.find_all("p", class_="biodata")
|
||||
for label, value in zip(labels, values):
|
||||
key = label.text.strip()
|
||||
val = value.text.strip()
|
||||
if key in ["Distributor", "Studio", "Director"]:
|
||||
link = value.find("a")
|
||||
if link:
|
||||
val = link.text.strip()
|
||||
movie_data[f'{key}Href'] = host_url + link['href']
|
||||
movie_data[key] = val
|
||||
else:
|
||||
return None
|
||||
|
||||
# Cast information
|
||||
performers = []
|
||||
cast_divs = soup.find_all("div", class_="castbox")
|
||||
for cast in cast_divs:
|
||||
performer = {}
|
||||
link = cast.find("a")
|
||||
if link:
|
||||
performer["name"] = link.text.strip()
|
||||
performer["href"] = host_url + link["href"]
|
||||
|
||||
performer["tags"] = [
|
||||
tag.strip() for br in cast.find_all("br")
|
||||
if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
|
||||
]
|
||||
|
||||
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
|
||||
performers.append(performer)
|
||||
|
||||
# Scene breakdowns
|
||||
scene_breakdowns = []
|
||||
scene_table = soup.find("div", id="sceneinfo")
|
||||
if scene_table:
|
||||
rows = scene_table.find_all("tr")
|
||||
|
||||
for row in rows:
|
||||
cols = row.find_all("td")
|
||||
if len(cols) >= 2:
|
||||
scene = cols[0].text.strip()  # scene number
performer_info = cols[1]  # performers plus any links
|
||||
|
||||
# Take the full HTML before the first <br> (keeps formatting such as <i> tags)
performer_html = str(performer_info)  # full HTML of the cell
split_html = performer_html.split("<br/>")  # split on <br/>
if split_html:
performers_html = split_html[0].strip()  # keep the part before the <br/>
else:
split_html = performer_html.split("<br>")  # split on <br>
if split_html:
performers_html = split_html[0].strip()  # keep the part before the <br>
else:
performers_html = performer_html.strip()  # no <br> at all, keep everything
|
||||
|
||||
# Reduce to plain text (strip the HTML tags, keep only the text)
|
||||
performers_soup = BeautifulSoup(performers_html, "html.parser")
|
||||
performers_text = performers_soup.get_text()
|
||||
|
||||
# Extract the performers
|
||||
scene_performers = [p.strip() for p in performers_text.split(",")]
|
||||
|
||||
# Try to pick up the webscene and studio links
|
||||
links_data = {}
|
||||
links = performer_info.find_all("a")
|
||||
if links:
|
||||
webscene_title = links[0].text.strip() if len(links)>0 else None
|
||||
webscene = links[0]["href"] if len(links)>0 else None
|
||||
studio = links[1].text.strip() if len(links)>1 else None
|
||||
studio_lnk = links[1]["href"] if len(links)>1 else None
|
||||
links_data = {
|
||||
"title": webscene_title,
|
||||
"webscene": webscene,
|
||||
"studio": studio,
|
||||
"studio_lnk": studio_lnk,
|
||||
}
|
||||
|
||||
scene_data = {
|
||||
"scene": scene,
|
||||
"performers": scene_performers,
|
||||
**links_data,
|
||||
}
|
||||
scene_breakdowns.append(scene_data)
|
||||
|
||||
appears_in = []
|
||||
appears_divs = soup.find("div", id="appearssection")
|
||||
if appears_divs:
|
||||
rows = appears_divs.find_all("li")
|
||||
for row in rows:
|
||||
lnk = row.find("a")
|
||||
if lnk:
|
||||
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
|
||||
|
||||
|
||||
return {
|
||||
"href": href,
|
||||
"title": title,
|
||||
"Minutes": movie_data.get("Minutes", ""),
|
||||
"Distributor": movie_data.get("Distributor", ""),
|
||||
"Studio": movie_data.get("Studio", ""),
|
||||
"ReleaseDate": movie_data.get("Release Date", ""),
|
||||
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
|
||||
"All-Girl": movie_data.get("All-Girl", ""),
|
||||
"All-Male": movie_data.get("All-Male", ""),
|
||||
"Compilation": movie_data.get("Compilation", ""),
|
||||
"Webscene": movie_data.get("Webscene", ""),
|
||||
"Director": movie_data.get("Director", ""),
|
||||
"DirectorHref": movie_data.get("DirectorHref", ""),
|
||||
"DistributorHref": movie_data.get("DistributorHref", ""),
|
||||
"StudioHref": movie_data.get("StudioHref", ""),
|
||||
"Performers": performers,
|
||||
"SceneBreakdowns": scene_breakdowns,
|
||||
"AppearsIn": appears_in,
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
for astro in astro_list:
|
||||
url = astr_base_url + astro
|
||||
next_url = url
|
||||
logging.info(f"Fetching data for {astro}, url {url} ...")
|
||||
|
||||
while True:
|
||||
soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = parse_page_astro(soup, astro)
|
||||
if list_data:
|
||||
print(list_data[0] if len(list_data)>0 else 'no data')
|
||||
break
|
||||
else:
|
||||
logging.info(f"Retrying {next_url} ...")
|
||||
time.sleep(5) # wait before retrying
|
||||
|
||||
time.sleep(2) # throttle the request rate
|
||||
459 scripts/iafd/src/sqlite_utils.py (Normal file)
@@ -0,0 +1,459 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import config
|
||||
import utils
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
# Connect to the SQLite database
DB_PATH = f"{config.global_share_data_dir}/shared.db"  # path to the shared database file
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# Current time as a formatted string
def get_current_time():
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
# Insert or update a performer.
|
||||
def insert_or_update_performer(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO performers (href, name, gender, birthday, astrology, birthplace, years_active, ethnicity, nationality, hair_colors,
|
||||
eye_color, height_str, weight_str, measurements, tattoos, piercings, weight, height, movies_cnt, vixen_cnt,
|
||||
blacked_cnt, tushy_cnt, x_art_cnt, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
gender = excluded.gender,
|
||||
birthday = excluded.birthday,
|
||||
astrology = excluded.astrology,
|
||||
birthplace = excluded.birthplace,
|
||||
years_active = excluded.years_active,
|
||||
ethnicity = excluded.ethnicity,
|
||||
nationality = excluded.nationality,
|
||||
hair_colors = excluded.hair_colors,
|
||||
eye_color = excluded.eye_color,
|
||||
height_str = excluded.height_str,
|
||||
weight_str = excluded.weight_str,
|
||||
measurements = excluded.measurements,
|
||||
tattoos = excluded.tattoos,
|
||||
piercings = excluded.piercings,
|
||||
weight = excluded.weight,
|
||||
height = excluded.height,
|
||||
movies_cnt = excluded.movies_cnt,
|
||||
vixen_cnt = excluded.vixen_cnt,
|
||||
blacked_cnt = excluded.blacked_cnt,
|
||||
tushy_cnt = excluded.tushy_cnt,
|
||||
x_art_cnt = excluded.x_art_cnt,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (
|
||||
data["href"], data["person"], data.get("gender"), data.get("birthday"), data.get("astrology"), data.get("birthplace"), data.get("years_active"),
|
||||
data.get("ethnicity"), data.get("nationality"), data.get("hair_colors"), data.get("eye_color"), data.get("height"),
|
||||
data.get("weight"), data.get("measurements"), data.get("tattoos"), data.get("piercings"), utils.parse_weight(data.get('weight')), utils.parse_height(data.get('height')),
|
||||
data.get("movies_cnt", 0), data.get("vixen_cnt", 0), data.get("blacked_cnt", 0), data.get("tushy_cnt", 0), data.get("x_art_cnt", 0)
|
||||
))
|
||||
|
||||
# Look up the performer_id
|
||||
cursor.execute("SELECT id FROM performers WHERE href = ?", (data["href"],))
|
||||
performer_id = cursor.fetchone()[0]
|
||||
|
||||
# Remove the old aliases
|
||||
cursor.execute("DELETE FROM performer_aliases WHERE performer_id = ?", (performer_id,))
|
||||
|
||||
# Insert the new aliases
|
||||
for alias in data.get("performer_aka", []):
|
||||
if alias.lower() != "no known aliases":
|
||||
cursor.execute("INSERT INTO performer_aliases (performer_id, alias) VALUES (?, ?)", (performer_id, alias))
|
||||
|
||||
conn.commit()
|
||||
logging.debug(f"Inserted/updated performer: {data['person']}")
|
||||
return performer_id
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"Database error: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error(f"Unexpected error: {e}")
|
||||
return None
|
||||
|
||||
# Delete a performer by id or href.
def delete_performer(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("DELETE FROM performers WHERE id = ?", (identifier,))
        elif isinstance(identifier, str):
            cursor.execute("DELETE FROM performers WHERE href = ?", (identifier,))
        else:
            logging.warning("Invalid delete argument")
            return
        conn.commit()
        logging.info(f"Deleted performer: {identifier}")

    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Delete failed: {e}")
|
||||
|
||||
# Query a performer by id, href, or name.
|
||||
def query_performer(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("SELECT * FROM performers WHERE id = ?", (identifier,))
|
||||
elif "http" in identifier:
|
||||
cursor.execute("SELECT * FROM performers WHERE href = ?", (identifier,))
|
||||
else:
|
||||
cursor.execute("SELECT * FROM performers WHERE name LIKE ?", (f"%{identifier}%",))
|
||||
|
||||
performer = cursor.fetchone()
|
||||
if performer:
|
||||
cursor.execute("SELECT alias FROM performer_aliases WHERE performer_id = ?", (performer[0],))
|
||||
aliases = [row[0] for row in cursor.fetchall()]
|
||||
result = dict(zip([desc[0] for desc in cursor.description], performer))
|
||||
result["performer_aka"] = aliases
|
||||
return result
|
||||
else:
|
||||
logging.warning(f"Performer not found: {identifier}")
|
||||
return None
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Query failed: {e}")
|
||||
return None
|
||||
|
||||
# Query the list of performer hrefs with optional filters.
|
||||
def query_performer_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href FROM performers WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0] for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"href query failed: {e}")
|
||||
return None
|
||||
|
||||
|
||||
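query_performer_hrefs and the matching distributor/studio/movie helpers below take optional keyword filters that are AND-ed into the WHERE clause. A small usage sketch (the filter value is illustrative):

all_hrefs = query_performer_hrefs()                  # every performer href in the table
some_hrefs = query_performer_hrefs(name='Kirsten')   # only performers whose name contains 'Kirsten'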
# Insert or update a distributor.
|
||||
def insert_or_update_distributor(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO distributors (name, href, updated_at)
|
||||
VALUES (?, ? , datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (data["name"], data["href"]))
|
||||
conn.commit()
|
||||
|
||||
# Look up the distributor id
|
||||
cursor.execute("SELECT id FROM distributors WHERE href = ?", (data["href"],))
|
||||
dist_id = cursor.fetchone()[0]
|
||||
if dist_id:
|
||||
logging.debug(f"Inserted/updated distributor: {data['name']}")
|
||||
return dist_id
|
||||
else:
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"Database error: {e}")
|
||||
return None
|
||||
|
||||
# Delete a distributor by id or name.
def delete_distributor(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("DELETE FROM distributors WHERE id = ?", (identifier,))
        elif isinstance(identifier, str):
            cursor.execute("DELETE FROM distributors WHERE name = ?", (identifier,))
        conn.commit()
        logging.info(f"Deleted distributor: {identifier}")
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Delete failed: {e}")
|
||||
|
||||
# Query a distributor by id or name.
def query_distributor(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("SELECT * FROM distributors WHERE id = ?", (identifier,))
        else:
            cursor.execute("SELECT * FROM distributors WHERE name LIKE ?", (f"%{identifier}%",))

        distributor = cursor.fetchone()
        if distributor:
            return dict(zip([desc[0] for desc in cursor.description], distributor))
        else:
            logging.warning(f"Distributor not found: {identifier}")
            return None
    except sqlite3.Error as e:
        logging.error(f"Query failed: {e}")
        return None
|
||||
|
||||
# Query the list of distributor hrefs with optional filters.
def query_distributor_hrefs(**filters):
    try:
        sql = "SELECT href FROM distributors WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "href" in filters:
            sql += " AND href = ?"
            params.append(filters["href"])
        if "name" in filters:
            sql += " AND name LIKE ?"
            params.append(f"%{filters['name']}%")

        cursor.execute(sql, params)
        return [row[0] for row in cursor.fetchall()]

    except sqlite3.Error as e:
        logging.error(f"href query failed: {e}")
        return None
|
||||
|
||||
# Insert or update a studio.
|
||||
def insert_or_update_studio(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO studios (name, href, updated_at)
|
||||
VALUES (?, ?, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (data["name"], data["href"]))
|
||||
conn.commit()
|
||||
|
||||
# Look up the studio id
|
||||
cursor.execute("SELECT id FROM studios WHERE href = ?", (data["href"],))
|
||||
stu_id = cursor.fetchone()[0]
|
||||
if stu_id:
|
||||
logging.debug(f"Inserted/updated studio: {data['name']}")
|
||||
return stu_id
|
||||
else:
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"Database error: {e}")
|
||||
return None
|
||||
|
||||
# Delete a studio by id or name.
def delete_studio(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("DELETE FROM studios WHERE id = ?", (identifier,))
        elif isinstance(identifier, str):
            cursor.execute("DELETE FROM studios WHERE name = ?", (identifier,))
        conn.commit()
        logging.info(f"Deleted studio: {identifier}")
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Delete failed: {e}")
|
||||
|
||||
# Query a studio by id or name.
def query_studio(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("SELECT * FROM studios WHERE id = ?", (identifier,))
        else:
            cursor.execute("SELECT * FROM studios WHERE name LIKE ?", (f"%{identifier}%",))

        studio = cursor.fetchone()
        if studio:
            return dict(zip([desc[0] for desc in cursor.description], studio))
        else:
            logging.warning(f"Studio not found: {identifier}")
            return None
    except sqlite3.Error as e:
        logging.error(f"Query failed: {e}")
        return None
|
||||
|
||||
# Query the list of studio hrefs with optional filters.
|
||||
def query_studio_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href FROM studios WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0] for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"href query failed: {e}")
|
||||
return None
|
||||
|
||||
# Look up an id by href in the given table.
|
||||
def get_id_by_href(table: str, href: str) -> int:
|
||||
cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
|
||||
row = cursor.fetchone()
|
||||
return row[0] if row else None
|
||||
|
||||
# Insert or update a movie.
|
||||
def insert_or_update_movie(movie_data):
|
||||
try:
|
||||
# Resolve the related IDs
|
||||
distributor_id = get_id_by_href('distributors', movie_data['DistributorHref'])
|
||||
studio_id = get_id_by_href('studios', movie_data['StudioHref'])
|
||||
director_id = get_id_by_href('performers', movie_data['DirectorHref'])
|
||||
|
||||
# Insert or update the movie row
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO movies (title, minutes, distributor_id, studio_id, release_date, added_to_IAFD_date,
|
||||
all_girl, all_male, compilation, webscene, director_id, href, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
title=excluded.title, minutes=excluded.minutes, distributor_id=excluded.distributor_id,
|
||||
studio_id=excluded.studio_id, release_date=excluded.release_date,
|
||||
added_to_IAFD_date=excluded.added_to_IAFD_date, all_girl=excluded.all_girl,
|
||||
all_male=excluded.all_male, compilation=excluded.compilation, webscene=excluded.webscene,
|
||||
director_id=excluded.director_id, updated_at = datetime('now', 'localtime')
|
||||
""",
|
||||
(movie_data['title'], movie_data['Minutes'], distributor_id, studio_id, movie_data['ReleaseDate'],
|
||||
movie_data['AddedtoIAFDDate'], movie_data['All-Girl'], movie_data['All-Male'],
|
||||
movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
|
||||
)
|
||||
conn.commit()
|
||||
logging.info("Movie inserted/updated: %s", movie_data['title'])
|
||||
|
||||
# Look up the id of the movie just inserted
|
||||
cursor.execute("SELECT id FROM movies WHERE href = ?", (movie_data['href'],))
|
||||
movie_id = cursor.fetchone()[0]
|
||||
|
||||
# Populate the performers_movies relation table
|
||||
for performer in movie_data.get('Performers', []):
|
||||
performer_id = get_id_by_href('performers', performer['href'])
|
||||
if performer_id:
|
||||
notes = '|'.join(performer['tags'])
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO performers_movies (performer_id, movie_id, role, notes)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes
|
||||
""",
|
||||
(performer_id, movie_id, "Actor", notes)
|
||||
)
|
||||
logging.debug(f"Performer {performer['href']} linked to movie: {movie_data['title']}")
|
||||
else:
|
||||
logging.warning(f"missing performer, url {performer['href']}, in movie: ({movie_data['title']}) {movie_data['href']}")
|
||||
|
||||
# Populate the movies_appers_in table
|
||||
for appears in movie_data.get("AppearsIn", []):
|
||||
appears_in_id = get_id_by_href('movies', appears['href'])
|
||||
if appears_in_id:
|
||||
cursor.execute("""
|
||||
INSERT INTO movies_appers_in (movie_id, appears_in_id, gradation, notes)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT(movie_id, appears_in_id) DO NOTHING
|
||||
""", (movie_id, appears_in_id, 1, appears["title"]))
|
||||
else:
|
||||
logging.warning(f"missing AppearsIn movie in movies table, parent_url {appears['href']}, in movie: ({movie_data['title']}) {movie_data['href']}")
|
||||
|
||||
conn.commit()
|
||||
return movie_id
|
||||
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error("Error inserting movie: %s", e)
|
||||
return None
|
||||
|
||||
# Delete a movie by id or href.
|
||||
def delete_movie(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("DELETE FROM movies WHERE id = ?", (identifier,))
|
||||
elif isinstance(identifier, str):
|
||||
cursor.execute("DELETE FROM movies WHERE href = ?", (identifier,))
|
||||
else:
|
||||
logging.warning("Invalid delete argument")
|
||||
return
|
||||
conn.commit()
|
||||
logging.info(f"Deleted movie with {identifier}")
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error("Error deleting movie: %s", e)
|
||||
|
||||
# Query a movie by id, href, or title.
|
||||
def query_movies(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("SELECT * FROM movies WHERE id = ?", (identifier,))
|
||||
elif "http" in identifier:
|
||||
cursor.execute("SELECT * FROM movies WHERE href = ?", (identifier,))
|
||||
else:
|
||||
cursor.execute("SELECT * FROM movies WHERE title LIKE ?", (f"%{identifier}%",))
|
||||
|
||||
movie = cursor.fetchone()
if movie:
# Build the movie dict before issuing the next query, otherwise cursor.description no longer describes the movies row
result = dict(zip([desc[0] for desc in cursor.description], movie))
cursor.execute("SELECT performer_id FROM performers_movies WHERE movie_id = ?", (movie[0],))
result["performers"] = [row[0] for row in cursor.fetchall()]
return result
|
||||
else:
|
||||
logging.warning(f"find no data: {identifier}")
|
||||
return None
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Query failed: {e}")
|
||||
return None
|
||||
|
||||
# Query the list of movie hrefs with optional filters.
|
||||
def query_movie_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href FROM movies WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "title" in filters:
|
||||
sql += " AND title LIKE ?"
|
||||
params.append(f"%{filters['title']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0] for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"href query failed: {e}")
|
||||
return []
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
try:
|
||||
with open('../result/detail.json', 'r') as file:
|
||||
performers = json.load(file)
|
||||
for performer in performers:
|
||||
insert_or_update_performer(performer)
|
||||
|
||||
print(query_performer("Kirsten"))
|
||||
#delete_performer("https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34")
|
||||
print(query_performer_hrefs())
|
||||
except FileNotFoundError:
|
||||
logging.info("detail.json not found, starting fresh.")
|
||||
92 scripts/iafd/src/utils.py (Normal file)
@@ -0,0 +1,92 @@
import re
import os
import json
import time
import csv
import logging

# Parse height and weight strings into numbers.
def parse_height(height_str):
    # NOTE: short-circuited to 0 for now; the parsing below is currently unreachable.
    return 0
    try:
        return int(height_str.split("(")[-1].replace(" cm)", ""))
    except:
        return None

def parse_weight(weight_str):
    # NOTE: short-circuited to 0 for now; the parsing below is currently unreachable.
    return 0
    try:
        return int(weight_str.split(" ")[0])
    except:
        return None

update_dir = '../result'
performers_dir = f'{update_dir}/performers'
movies_dir = f'{update_dir}/movies'

def uniq_performers(new_performers):
    try:
        if not isinstance(new_performers, list):
            raise TypeError(f"new_performers should be a list, but got {type(new_performers)}")

        seen = set()
        unique_performers = []

        for item in new_performers:
            if not item or item['href'] is None:
                raise ValueError(f"Invalid item in new_performers: {item}")

            if item["href"] not in seen:
                seen.add(item["href"])
                unique_performers.append(item)

        return unique_performers

    except Exception as e:
        logging.error(f"Error in uniq_performers: {e}")
        return []  # return an empty list instead of crashing

# Create a sub-directory keyed by the given name.
def create_sub_directory(base_dir, name):
    # Use the first character of the name, lower-cased
    sub_dir = name[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    return full_path

# Extract the id value from a URL such as
# https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from an href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
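A quick usage sketch of extract_id_from_href (the href is the example from the comment above):

print(extract_id_from_href('https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586'))
# -> 21898a3c-1ddd-4793-8d93-375d6db20586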
# Write each performer to its own JSON file.
def write_person_json(person, href, data):
    # Work out the target directory
    person_dir = create_sub_directory(performers_dir, person)
    person_id = extract_id_from_href(href)
    person_filename = f"{person.replace(' ', '-')}({person_id}).json"  # replace spaces with -
    full_path = os.path.join(person_dir, person_filename)

    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")


# Write each movie to its own JSON file.
def write_movie_json(href, data):
    # Work out the target directory
    movie_id = extract_id_from_href(href)
    movie_dir = create_sub_directory(movies_dir, movie_id)
    movie_filename = f"{movie_id}.json"
    full_path = os.path.join(movie_dir, movie_filename)

    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")