modify some scripts.

This commit is contained in:
2025-03-04 09:56:05 +08:00
parent a0e78ef77e
commit d4e1953e86
7 changed files with 186 additions and 281 deletions

View File

@ -1,20 +0,0 @@
{
"href": "https://www.iafd.com/title.rme/id=0b332f5e-c0b8-4536-a79c-5abd4da9c68a",
"title": "Barebackin' Men",
"Minutes": "No Data",
"Distributor": "1 Distribution",
"Studio": "1 Distribution",
"ReleaseDate": "No Data",
"AddedtoIAFDDate": "Jan 1, 2006",
"All-Girl": "No",
"All-Male": "Yes",
"Compilation": "No",
"Webscene": "",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=10/1%2ddistribution.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=10/1%2ddistribution.htm",
"Performers": [],
"SceneBreakdowns": [],
"AppearsIn": []
}

View File

@ -1,56 +0,0 @@
{
"href": "https://www.iafd.com/title.rme/id=2f582dcf-192e-4adf-9d60-447df8f16b9c",
"title": "Slim Goodies POV 2",
"Minutes": "84",
"Distributor": "Exotic Vixen Films",
"Studio": "Exotic Vixen Films",
"ReleaseDate": "No Data",
"AddedtoIAFDDate": "Jan 17, 2024",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "",
"Director": "Just Mike Starks",
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
"Performers": [
{
"name": "Amica Mea",
"href": "https://www.iafd.com/person.rme/id=933ab6e6-ba98-49a7-a9e2-c1a4523ab89c",
"tags": [
"Amica Mea"
]
},
{
"name": "Baby Breezy",
"href": "https://www.iafd.com/person.rme/id=b0d3673c-20b4-41eb-bb15-622b2f8e5ed3",
"tags": [
"Baby Breezy"
]
},
{
"name": "Blu Mere",
"href": "https://www.iafd.com/person.rme/id=8d41dd04-f188-44c3-ac40-dc64711cf905",
"tags": [
"Blu Mere"
]
},
{
"name": "Just Mike Starks",
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"tags": [
"Just Mike Starks"
]
},
{
"name": "Mocha Menage",
"href": "https://www.iafd.com/person.rme/id=63b084e9-a144-41da-acf9-a913b68416cd",
"tags": [
"Mocha Menage"
]
}
],
"SceneBreakdowns": [],
"AppearsIn": []
}

View File

@ -1,70 +0,0 @@
{
"href": "https://www.iafd.com/title.rme/id=9af4e9f4-68ce-47ec-a7d7-fde92862af57",
"title": "Atlanta U: College Freaks",
"Minutes": "No Data",
"Distributor": "Exotic Vixen Films",
"Studio": "Exotic Vixen Films",
"ReleaseDate": "No Data",
"AddedtoIAFDDate": "Sep 19, 2020",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "",
"Director": "Just Mike Starks",
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
"Performers": [
{
"name": "Aaliyah Ali",
"href": "https://www.iafd.com/person.rme/id=7904b6a0-965c-4137-92e2-2ca0cdc4ff38",
"tags": [
"Aaliyah Ali"
]
},
{
"name": "Bones Montana",
"href": "https://www.iafd.com/person.rme/id=8c18f9ad-045d-475f-bee8-cab6bed1fad4",
"tags": [
"Bones Montana"
]
},
{
"name": "Cameron Cox",
"href": "https://www.iafd.com/person.rme/id=ce96cf3d-e85e-4d46-ad5e-f05881c88a26",
"tags": [
"Cameron Cox"
]
},
{
"name": "Crystal Cooper",
"href": "https://www.iafd.com/person.rme/id=fc02ac73-2892-4578-9bf4-eed87afc4980",
"tags": [
"Crystal Cooper"
]
},
{
"name": "Jazmine Adore",
"href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
"tags": [
"Jazmine Adore"
]
},
{
"name": "Just Mike Starks",
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"tags": [
"Just Mike Starks"
]
},
{
"name": "Lala Ivey",
"href": "https://www.iafd.com/person.rme/id=7ae838ee-764f-4725-b422-b41ffac1e67b",
"tags": [
"Lala Ivey"
]
}
],
"SceneBreakdowns": [],
"AppearsIn": []
}

View File

@ -1,85 +0,0 @@
{
"href": "https://www.iafd.com/title.rme/id=ca753243-8e3a-49ac-88aa-357055187e8c",
"title": "Slim Goodies POV",
"Minutes": "61",
"Distributor": "Exotic Vixen Films",
"Studio": "Exotic Vixen Films",
"ReleaseDate": "No Data",
"AddedtoIAFDDate": "Sep 19, 2020",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "",
"Director": "Just Mike Starks",
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
"Performers": [
{
"name": "Gina Ferrero",
"href": "https://www.iafd.com/person.rme/id=54d62a5b-2ce5-40fa-b050-14325a62f4fd",
"tags": [
"Gina Ferrero"
]
},
{
"name": "Imani Reign",
"href": "https://www.iafd.com/person.rme/id=8566216f-bcbc-4764-ba50-73405cf79dce",
"tags": [
"Imani Reign"
]
},
{
"name": "Jazmine Adore",
"href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
"tags": [
"Jazmine Adore"
]
},
{
"name": "Just Mike Starks",
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"tags": [
"Just Mike Starks"
]
},
{
"name": "Niomie King",
"href": "https://www.iafd.com/person.rme/id=dbefea32-7f88-41a9-9de3-e467015d687a",
"tags": [
"Niomie King"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Imani Reign",
"Just Mike Starks"
]
},
{
"scene": "Scene 2",
"performers": [
"Jazmine Adore",
"Just Mike Starks"
]
},
{
"scene": "Scene 3",
"performers": [
"Gina Ferrero",
"Just Mike Starks"
]
},
{
"scene": "Scene 4",
"performers": [
"Niomie King",
"Just Mike Starks"
]
}
],
"AppearsIn": []
}

View File

@ -6,13 +6,13 @@ import argparse
import logging
from functools import partial
import config
import sqlite_utils as utils
import sqlite_utils as db_tools
import iafd_scraper as scraper
import utils as func
import utils
config.setup_logging()
debug = True
debug = False
# 按星座获取演员列表,无翻页
def fetch_performers_by_astro(existed_performer_hrefs):
@ -30,7 +30,7 @@ def fetch_performers_by_astro(existed_performer_hrefs):
if row['href'] not in existed_performer_hrefs:
performers.append({
'person' : row['person'],
'href' : row['href']
'href' : row['href'].lower() if row['href'] else ''
})
else:
logging.warning(f'fetch astro error. {url} ...')
@ -59,7 +59,7 @@ def fetch_performers_by_birth(existed_performer_hrefs):
if row['href'] not in existed_performer_hrefs:
performers.append({
'person' : row['person'],
'href' : row['href']
'href' : row['href'].lower() if row['href'] else ''
})
else:
logging.warning(f'fetch astro error. {url} ...')
@ -95,7 +95,7 @@ def fetch_performers_by_ethnic(existed_performer_hrefs):
if row['href'] not in existed_performer_hrefs:
performers.append({
'person' : row['person'],
'href' : row['href']
'href' : row['href'].lower() if row['href'] else ''
})
else:
logging.warning(f'fetch astro error. {url} ...')
@ -124,7 +124,7 @@ def fetch_distributors_list(existed_distributors_href):
continue
distributors_list.append({
'name' : row['name'],
'href' : dis_url
'href' : dis_url.lower() if dis_url else ''
})
else:
logging.warning(f'fetch astro error. {url} ...')
@ -148,7 +148,7 @@ def fetch_studios_list(existed_studios_href):
continue
studios_list.append({
'name' : row['name'],
'href' : stu_url
'href' : stu_url.lower() if stu_url else ''
})
else:
logging.warning(f'fetch astro error. {url} ...')
@ -159,28 +159,37 @@ def fetch_studios_list(existed_studios_href):
# 获取更新
def check_update():
# 读取数据库中的演员列表
existed_performer_hrefs = utils.query_performer_hrefs()
existed_performer_hrefs = db_tools.query_performer_hrefs()
if not existed_performer_hrefs:
logging.warning(f'get existed performers from db error.')
return None
# 开启任务
task_id = db_tools.insert_task_log()
if task_id is None:
logging.warning(f'insert task log error.')
return None
# 从列表页获取新的演员
new_performers = []
#new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
#new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
if not debug : # 数据量较大debug 模式下跳过
new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))
# 逐个获取演员信息并写入到db中
new_performers = list({item["href"]: item for item in new_performers}.values())
logging.info(f'get new performers count: {len(new_performers)} ')
logging.info(f'get new performers count: {len(new_performers)} ')
db_tools.update_task_log(task_id, before_performers=len(existed_performer_hrefs), new_performers=len(new_performers), task_status='Inserting new performers')
for performer in new_performers:
url = performer['href']
person = performer['person']
logging.info(f"Fetching data for performer {person}, url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
if soup:
data, credits = scraper.parse_page_performer(soup)
if data:
performer_id = utils.insert_or_update_performer({
performer_id = db_tools.insert_or_update_performer({
'href': url,
'person': person,
**data
@ -191,7 +200,7 @@ def check_update():
logging.warning(f'insert person: {person} {url} failed.')
# 写入到本地json文件
func.write_person_json(person, url, {
utils.write_person_json(person, url, {
'href': url,
'person': person,
**data,
@ -206,33 +215,35 @@ def check_update():
break
# 从数据库读取distributors列表
existed_distributors_href = utils.query_distributor_hrefs()
existed_distributors_href = db_tools.query_distributor_hrefs()
if existed_distributors_href is None:
logging.warning(f'get existed distributors from db error.')
return
new_distributors = fetch_distributors_list(existed_distributors_href)
db_tools.update_task_log(task_id, before_distributors=len(existed_distributors_href), new_distributors=len(new_distributors), task_status='Inserting new distributors')
for dist in new_distributors:
dist_id = utils.insert_or_update_distributor(dist)
dist_id = db_tools.insert_or_update_distributor(dist)
if dist_id:
logging.info(f'insert one distributor record, id: {dist_id}, name: {dist['name']}, href: {dist['href']}')
else:
logging.warning(f'insert into studio failed. name: {dist['name']} href: {dist['href']}')
# 从数据库读取studios列表
existed_studios_href = utils.query_studio_hrefs()
existed_studios_href = db_tools.query_studio_hrefs()
if existed_studios_href is None:
logging.warning(f'get existed studios from db error.')
return
new_studios = fetch_studios_list(existed_studios_href)
db_tools.update_task_log(task_id, before_studios=len(existed_studios_href), new_studios=len(new_studios), task_status='Inserting new studios')
for stu in new_studios:
stu_id = utils.insert_or_update_studio(stu)
stu_id = db_tools.insert_or_update_studio(stu)
if stu_id:
logging.info(f'insert one studio record, id: {stu_id}, name: {stu['name']}, href: {stu['href']}')
else:
logging.warning(f'insert into studio failed. name: {stu['name']}, href: {stu['href']}')
# 从数据库中读取影片列表
existed_movies = utils.query_movie_hrefs()
existed_movies = db_tools.query_movie_hrefs()
if existed_movies is None:
logging.warning(f'load movies from db error')
return
@ -240,11 +251,12 @@ def check_update():
new_movie_hrefs = []
# 遍历所有 distributors获取 movies 列表
existed_distributors_href = utils.query_distributor_hrefs(name='vixen')
existed_distributors_href = db_tools.query_distributor_hrefs(name='vixen')
if existed_distributors_href is None:
logging.warning(f'get existed distributors from db error.')
return
for url in existed_distributors_href:
logging.info(f"Fetching data for distributor url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
@ -265,11 +277,12 @@ def check_update():
logging.info(f'all new moives found for distributors, now total new {len(new_movies)}')
# 遍历所有 studios获取 movies 列表
existed_studios_href = utils.query_studio_hrefs(name='vixen')
existed_studios_href = db_tools.query_studio_hrefs(name='vixen')
if existed_studios_href is None:
logging.warning(f'get existed studios from db error.')
return
for url in existed_studios_href:
logging.info(f"Fetching data for studio url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
@ -292,21 +305,28 @@ def check_update():
# 对新的影片,逐个获取内容
new_movies = list({item["href"]: item for item in new_movies}.values())
logging.info(f'get merged new movies, count: {len(new_movies)} ')
db_tools.update_task_log(task_id, before_movies=len(existed_movies), new_movies=len(new_movies), task_status='Inserting new movies')
for movie in new_movies:
url = movie['href']
title = movie['title']
logging.info(f"Fetching data for movie {title}, url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
if soup:
movie_data = scraper.parse_page_movie(soup, url, title)
if movie_data :
movie_id = utils.insert_or_update_movie(movie_data)
# 修复url不规范的问题
if movie_data['DistributorHref']:
movie_data['DistributorHref'] = utils.dist_stu_href_rewrite(movie_data['DistributorHref'].lower())
if movie_data['StudioHref']:
movie_data['StudioHref'] = utils.dist_stu_href_rewrite(movie_data['StudioHref'].lower())
movie_id = db_tools.insert_or_update_movie(movie_data)
if movie_id:
logging.info(f'insert one movie, id: {movie_id}, title: {title} url: {url}')
logging.info(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
else:
logging.warning(f'insert movie {url} failed.')
# 写入到本地json文件
func.write_movie_json(url, movie_data)
utils.write_movie_json(url, movie_data)
else:
logging.warning(f'parse_page_movie error. url: {url}')
else:
@ -315,6 +335,71 @@ def check_update():
if debug:
break
# TODO:
# 1, appearsIn 因为影片入库的先后顺序不可控,会出现无法插入 movies_appers_in 表的情况应该要先记录下待处理的movie所有记录插入完成后再做处理
# 2, movie 的更新涉及到performers的几个统计字段的更新应该要找到本次tasklog启动后插入到 performers_movies 表里的所有performers刷新其统计数据也可以简单粗暴的全量更新
# 3, 目前performers_movies以movies爬取的信息为主来更新perfomers爬取的信息应该可以作为检验尤其是perfomers页面有notes字段
logging.info(f'all process completed!')
db_tools.finalize_task_log(task_id)
# Import previously scraped local data into the database.
def load_data():
    """Load performer and movie JSON dumps from ../result into the database.

    Reads two locally cached JSON files produced by earlier scraping runs and
    upserts every record via db_tools. A missing or unreadable file is logged
    and treated as an empty list so the other file is still processed.
    """
    # Import locally cached performers.
    performers_file = '../result/detail.json'
    performers_data = utils.read_json(performers_file)
    if performers_data is None:
        # Use the configured logger (not print) and include the path.
        logging.warning('read performers file error: %s', performers_file)
        performers_data = []
    for person in performers_data:
        performer_id = db_tools.insert_or_update_performer(person)
        if performer_id:
            logging.info(f'insert one person, id: {performer_id}, person: {person["person"]}, url: {person["href"]}')
        else:
            logging.warning(f'insert person: {person["person"]}, {person["href"]} failed.')
    # Import locally cached movies.
    movies_file = '../result/movie_details.json'
    movies_data = utils.read_json(movies_file)
    if movies_data is None:
        logging.warning('read movies file error: %s', movies_file)
        movies_data = []
    for movie in movies_data:
        # Normalize non-canonical distributor/studio URLs before insertion.
        if movie['DistributorHref']:
            movie['DistributorHref'] = utils.dist_stu_href_rewrite(movie['DistributorHref'].lower())
        if movie['StudioHref']:
            movie['StudioHref'] = utils.dist_stu_href_rewrite(movie['StudioHref'].lower())
        movie_id = db_tools.insert_or_update_movie(movie)
        if movie_id:
            logging.info(f'insert one movie, id: {movie_id}, title: {movie["title"]} url: {movie["href"]}')
        else:
            logging.warning(f'insert movie {movie["title"]}, {movie["href"]} failed.')
    logging.info('task completed.')
# Entry point: dispatch the requested task.
def main(task, args_debug):
    """Dispatch to the requested task.

    Args:
        task: 'fetch' to scrape iafd.com for updates, or 'load' to import
            local JSON dumps into the database. Anything else prints a hint.
        args_debug: When True, enable debug mode (record limits apply).
    """
    global debug
    debug = args_debug
    if debug:
        logging.info('Debug mode enabled.')
    if task == 'fetch':
        check_update()
    elif task == 'load':
        load_data()
    else:
        # Fixed typo ('unkown' -> 'unknown') and dropped the pointless f-prefix.
        print('unknown command. see --help.')
if __name__ == "__main__":
check_update()
# 命令行参数处理
parser = argparse.ArgumentParser(description='fetch iafd data.')
parser.add_argument('--task', type=str, default='fetch', help='fetch from iafd.com or load from local data ... (fetch , load)')
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
args = parser.parse_args()
main(args.task, args.debug)

View File

@ -137,7 +137,7 @@ def query_performer_hrefs(**filters):
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()]
return [row[0].lower() for row in cursor.fetchall()] # 返回小写
except sqlite3.Error as e:
logging.error(f"查询 href 失败: {e}")
@ -217,7 +217,7 @@ def query_distributor_hrefs(**filters):
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()]
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
except sqlite3.Error as e:
logging.error(f"查询 href 失败: {e}")
@ -296,7 +296,7 @@ def query_studio_hrefs(**filters):
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()]
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
except sqlite3.Error as e:
logging.error(f"查询 href 失败: {e}")
@ -334,7 +334,7 @@ def insert_or_update_movie(movie_data):
movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
)
conn.commit()
logging.info("Movie inserted/updated: %s", movie_data['title'])
logging.debug("Movie inserted/updated: %s", movie_data['title'])
# 获取插入的 movie_id
cursor.execute("SELECT id FROM movies WHERE href = ?", (movie_data['href'],))
@ -437,12 +437,62 @@ def query_movie_hrefs(**filters):
params.append(f"%{filters['title']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()]
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
except sqlite3.Error as e:
logging.error(f"查询 href 失败: {e}")
return []
# Open a new task-log entry.
def insert_task_log():
    """Create a task_log row with status 'Start' and return its rowid.

    Returns:
        The new task id on success, or None if the insert failed.
    """
    try:
        cursor.execute("""
            INSERT INTO task_log (task_status) VALUES ('Start')
        """)
        conn.commit()
        # lastrowid of the cursor is the id of the row just inserted.
        return cursor.lastrowid
    except sqlite3.Error as e:
        logging.error(f"插入任务失败: {e}")
        return None
# Update selected columns of a task-log entry.
def update_task_log(task_id, **kwargs):
    """Update arbitrary columns of the task_log row identified by task_id.

    Each keyword argument becomes a ``column = ?`` assignment; ``updated_at``
    is always refreshed to the current local time. Errors are logged, not raised.

    NOTE(review): column names come from kwargs keys and are interpolated into
    the SQL via f-string, so callers must pass trusted, internal key names
    only; the values themselves are safely bound as parameters.
    """
    try:
        fields = ", ".join(f"{key} = ?" for key in kwargs.keys())
        params = list(kwargs.values()) + [task_id]
        sql = f"UPDATE task_log SET {fields}, updated_at = datetime('now', 'localtime') WHERE task_id = ?"
        cursor.execute(sql, params)
        conn.commit()
    except sqlite3.Error as e:
        logging.error(f"更新任务 {task_id} 失败: {e}")
# Close out a task: record final table sizes and mark success.
def finalize_task_log(task_id):
    """Record final row counts of the main tables and set the task to Success.

    Args:
        task_id: id of the task_log row to finalize.
    """
    try:
        # Small helper: final row count of one table.
        def _count(table):
            cursor.execute(f"SELECT COUNT(*) FROM {table}")
            return cursor.fetchone()[0]
        # Write all the "after" counters and the final status in one update.
        update_task_log(
            task_id,
            after_performers=_count("performers"),
            after_movies=_count("movies"),
            after_distributors=_count("distributors"),
            after_studios=_count("studios"),
            task_status="Success",
        )
    except sqlite3.Error as e:
        logging.error(f"任务 {task_id} 结束失败: {e}")
if __name__ == "__main__":

View File

@ -24,27 +24,16 @@ update_dir = '../result'
performers_dir = f'{update_dir}/performers'
movies_dir = f'{update_dir}/movies'
def uniq_performers(new_performers):
try:
if not isinstance(new_performers, list):
raise TypeError(f"new_performers should be a list, but got {type(new_performers)}")
def dist_stu_href_rewrite(href):
# 提取 ID适用于 distrib 或 studio
import re
match = re.search(r"(distrib|studio)=(\d+)", href)
if not match:
return None # 不是目标 URL返回 None
seen = set()
unique_performers = []
for item in new_performers:
if not item or item['href'] is None:
raise ValueError(f"Invalid item in new_performers: {item}")
if item["href"] not in seen:
seen.add(item["href"])
unique_performers.append(item)
return unique_performers
except Exception as e:
logging.error(f"Error in remove_duplicate_performers: {e}")
return [] # 返回空列表,避免程序崩溃
key, id_number = match.groups()
new_url = f"https://www.iafd.com/{key}.rme/{key}={id_number}"
return new_url
# 创建目录
def create_sub_directory(base_dir, str):
@ -90,3 +79,15 @@ def write_movie_json(href, data):
except Exception as e:
logging.error(f"Error writing file {full_path}: {e}")
# Read a JSON file and return its parsed content.
def read_json(file_path):
    """Parse the JSON file at ``file_path``.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The parsed object, or None when the file is missing or not valid JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        # Report through the configured logger instead of print, so the
        # failure is captured alongside the rest of the run's log output.
        logging.error(f"文件 {file_path} 未找到.")
        return None
    except json.JSONDecodeError:
        logging.error(f"文件 {file_path} 解析错误.")
        return None