modify some scripts.
This commit is contained in:
@ -1,20 +0,0 @@
|
||||
{
|
||||
"href": "https://www.iafd.com/title.rme/id=0b332f5e-c0b8-4536-a79c-5abd4da9c68a",
|
||||
"title": "Barebackin' Men",
|
||||
"Minutes": "No Data",
|
||||
"Distributor": "1 Distribution",
|
||||
"Studio": "1 Distribution",
|
||||
"ReleaseDate": "No Data",
|
||||
"AddedtoIAFDDate": "Jan 1, 2006",
|
||||
"All-Girl": "No",
|
||||
"All-Male": "Yes",
|
||||
"Compilation": "No",
|
||||
"Webscene": "",
|
||||
"Director": "No Data",
|
||||
"DirectorHref": "",
|
||||
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=10/1%2ddistribution.htm",
|
||||
"StudioHref": "https://www.iafd.com/studio.rme/studio=10/1%2ddistribution.htm",
|
||||
"Performers": [],
|
||||
"SceneBreakdowns": [],
|
||||
"AppearsIn": []
|
||||
}
|
||||
@ -1,56 +0,0 @@
|
||||
{
|
||||
"href": "https://www.iafd.com/title.rme/id=2f582dcf-192e-4adf-9d60-447df8f16b9c",
|
||||
"title": "Slim Goodies POV 2",
|
||||
"Minutes": "84",
|
||||
"Distributor": "Exotic Vixen Films",
|
||||
"Studio": "Exotic Vixen Films",
|
||||
"ReleaseDate": "No Data",
|
||||
"AddedtoIAFDDate": "Jan 17, 2024",
|
||||
"All-Girl": "No",
|
||||
"All-Male": "No",
|
||||
"Compilation": "No",
|
||||
"Webscene": "",
|
||||
"Director": "Just Mike Starks",
|
||||
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"Performers": [
|
||||
{
|
||||
"name": "Amica Mea",
|
||||
"href": "https://www.iafd.com/person.rme/id=933ab6e6-ba98-49a7-a9e2-c1a4523ab89c",
|
||||
"tags": [
|
||||
"Amica Mea"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Baby Breezy",
|
||||
"href": "https://www.iafd.com/person.rme/id=b0d3673c-20b4-41eb-bb15-622b2f8e5ed3",
|
||||
"tags": [
|
||||
"Baby Breezy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Blu Mere",
|
||||
"href": "https://www.iafd.com/person.rme/id=8d41dd04-f188-44c3-ac40-dc64711cf905",
|
||||
"tags": [
|
||||
"Blu Mere"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Just Mike Starks",
|
||||
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"tags": [
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Mocha Menage",
|
||||
"href": "https://www.iafd.com/person.rme/id=63b084e9-a144-41da-acf9-a913b68416cd",
|
||||
"tags": [
|
||||
"Mocha Menage"
|
||||
]
|
||||
}
|
||||
],
|
||||
"SceneBreakdowns": [],
|
||||
"AppearsIn": []
|
||||
}
|
||||
@ -1,70 +0,0 @@
|
||||
{
|
||||
"href": "https://www.iafd.com/title.rme/id=9af4e9f4-68ce-47ec-a7d7-fde92862af57",
|
||||
"title": "Atlanta U: College Freaks",
|
||||
"Minutes": "No Data",
|
||||
"Distributor": "Exotic Vixen Films",
|
||||
"Studio": "Exotic Vixen Films",
|
||||
"ReleaseDate": "No Data",
|
||||
"AddedtoIAFDDate": "Sep 19, 2020",
|
||||
"All-Girl": "No",
|
||||
"All-Male": "No",
|
||||
"Compilation": "No",
|
||||
"Webscene": "",
|
||||
"Director": "Just Mike Starks",
|
||||
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"Performers": [
|
||||
{
|
||||
"name": "Aaliyah Ali",
|
||||
"href": "https://www.iafd.com/person.rme/id=7904b6a0-965c-4137-92e2-2ca0cdc4ff38",
|
||||
"tags": [
|
||||
"Aaliyah Ali"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Bones Montana",
|
||||
"href": "https://www.iafd.com/person.rme/id=8c18f9ad-045d-475f-bee8-cab6bed1fad4",
|
||||
"tags": [
|
||||
"Bones Montana"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Cameron Cox",
|
||||
"href": "https://www.iafd.com/person.rme/id=ce96cf3d-e85e-4d46-ad5e-f05881c88a26",
|
||||
"tags": [
|
||||
"Cameron Cox"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Crystal Cooper",
|
||||
"href": "https://www.iafd.com/person.rme/id=fc02ac73-2892-4578-9bf4-eed87afc4980",
|
||||
"tags": [
|
||||
"Crystal Cooper"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Jazmine Adore",
|
||||
"href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
|
||||
"tags": [
|
||||
"Jazmine Adore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Just Mike Starks",
|
||||
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"tags": [
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Lala Ivey",
|
||||
"href": "https://www.iafd.com/person.rme/id=7ae838ee-764f-4725-b422-b41ffac1e67b",
|
||||
"tags": [
|
||||
"Lala Ivey"
|
||||
]
|
||||
}
|
||||
],
|
||||
"SceneBreakdowns": [],
|
||||
"AppearsIn": []
|
||||
}
|
||||
@ -1,85 +0,0 @@
|
||||
{
|
||||
"href": "https://www.iafd.com/title.rme/id=ca753243-8e3a-49ac-88aa-357055187e8c",
|
||||
"title": "Slim Goodies POV",
|
||||
"Minutes": "61",
|
||||
"Distributor": "Exotic Vixen Films",
|
||||
"Studio": "Exotic Vixen Films",
|
||||
"ReleaseDate": "No Data",
|
||||
"AddedtoIAFDDate": "Sep 19, 2020",
|
||||
"All-Girl": "No",
|
||||
"All-Male": "No",
|
||||
"Compilation": "No",
|
||||
"Webscene": "",
|
||||
"Director": "Just Mike Starks",
|
||||
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"Performers": [
|
||||
{
|
||||
"name": "Gina Ferrero",
|
||||
"href": "https://www.iafd.com/person.rme/id=54d62a5b-2ce5-40fa-b050-14325a62f4fd",
|
||||
"tags": [
|
||||
"Gina Ferrero"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Imani Reign",
|
||||
"href": "https://www.iafd.com/person.rme/id=8566216f-bcbc-4764-ba50-73405cf79dce",
|
||||
"tags": [
|
||||
"Imani Reign"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Jazmine Adore",
|
||||
"href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
|
||||
"tags": [
|
||||
"Jazmine Adore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Just Mike Starks",
|
||||
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"tags": [
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Niomie King",
|
||||
"href": "https://www.iafd.com/person.rme/id=dbefea32-7f88-41a9-9de3-e467015d687a",
|
||||
"tags": [
|
||||
"Niomie King"
|
||||
]
|
||||
}
|
||||
],
|
||||
"SceneBreakdowns": [
|
||||
{
|
||||
"scene": "Scene 1",
|
||||
"performers": [
|
||||
"Imani Reign",
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scene": "Scene 2",
|
||||
"performers": [
|
||||
"Jazmine Adore",
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scene": "Scene 3",
|
||||
"performers": [
|
||||
"Gina Ferrero",
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scene": "Scene 4",
|
||||
"performers": [
|
||||
"Niomie King",
|
||||
"Just Mike Starks"
|
||||
]
|
||||
}
|
||||
],
|
||||
"AppearsIn": []
|
||||
}
|
||||
@ -6,13 +6,13 @@ import argparse
|
||||
import logging
|
||||
from functools import partial
|
||||
import config
|
||||
import sqlite_utils as utils
|
||||
import sqlite_utils as db_tools
|
||||
import iafd_scraper as scraper
|
||||
import utils as func
|
||||
import utils
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
debug = True
|
||||
debug = False
|
||||
|
||||
# 按星座获取演员列表,无翻页
|
||||
def fetch_performers_by_astro(existed_performer_hrefs):
|
||||
@ -30,7 +30,7 @@ def fetch_performers_by_astro(existed_performer_hrefs):
|
||||
if row['href'] not in existed_performer_hrefs:
|
||||
performers.append({
|
||||
'person' : row['person'],
|
||||
'href' : row['href']
|
||||
'href' : row['href'].lower() if row['href'] else ''
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
@ -59,7 +59,7 @@ def fetch_performers_by_birth(existed_performer_hrefs):
|
||||
if row['href'] not in existed_performer_hrefs:
|
||||
performers.append({
|
||||
'person' : row['person'],
|
||||
'href' : row['href']
|
||||
'href' : row['href'].lower() if row['href'] else ''
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
@ -95,7 +95,7 @@ def fetch_performers_by_ethnic(existed_performer_hrefs):
|
||||
if row['href'] not in existed_performer_hrefs:
|
||||
performers.append({
|
||||
'person' : row['person'],
|
||||
'href' : row['href']
|
||||
'href' : row['href'].lower() if row['href'] else ''
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
@ -124,7 +124,7 @@ def fetch_distributors_list(existed_distributors_href):
|
||||
continue
|
||||
distributors_list.append({
|
||||
'name' : row['name'],
|
||||
'href' : dis_url
|
||||
'href' : dis_url.lower() if dis_url else ''
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
@ -148,7 +148,7 @@ def fetch_studios_list(existed_studios_href):
|
||||
continue
|
||||
studios_list.append({
|
||||
'name' : row['name'],
|
||||
'href' : stu_url
|
||||
'href' : stu_url.lower() if stu_url else ''
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
@ -159,28 +159,37 @@ def fetch_studios_list(existed_studios_href):
|
||||
# 获取更新
|
||||
def check_update():
|
||||
# 读取数据库中的演员列表
|
||||
existed_performer_hrefs = utils.query_performer_hrefs()
|
||||
existed_performer_hrefs = db_tools.query_performer_hrefs()
|
||||
if not existed_performer_hrefs:
|
||||
logging.warning(f'get existed performers from db error.')
|
||||
return None
|
||||
|
||||
# 开启任务
|
||||
task_id = db_tools.insert_task_log()
|
||||
if task_id is None:
|
||||
logging.warning(f'insert task log error.')
|
||||
return None
|
||||
|
||||
# 从列表页获取新的演员
|
||||
new_performers = []
|
||||
#new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
|
||||
#new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
|
||||
if not debug : # 数据量较大,debug 模式下跳过
|
||||
new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
|
||||
new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
|
||||
new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))
|
||||
|
||||
# 逐个获取演员信息,并写入到db中
|
||||
new_performers = list({item["href"]: item for item in new_performers}.values())
|
||||
logging.info(f'get new performers count: {len(new_performers)} ')
|
||||
db_tools.update_task_log(task_id, before_performers=len(existed_performer_hrefs), new_performers=len(new_performers), task_status='Inserting new performers')
|
||||
for performer in new_performers:
|
||||
url = performer['href']
|
||||
person = performer['person']
|
||||
logging.info(f"Fetching data for performer {person}, url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
|
||||
if soup:
|
||||
data, credits = scraper.parse_page_performer(soup)
|
||||
if data:
|
||||
performer_id = utils.insert_or_update_performer({
|
||||
performer_id = db_tools.insert_or_update_performer({
|
||||
'href': url,
|
||||
'person': person,
|
||||
**data
|
||||
@ -191,7 +200,7 @@ def check_update():
|
||||
logging.warning(f'insert person: {person} {url} failed.')
|
||||
|
||||
# 写入到本地json文件
|
||||
func.write_person_json(person, url, {
|
||||
utils.write_person_json(person, url, {
|
||||
'href': url,
|
||||
'person': person,
|
||||
**data,
|
||||
@ -206,33 +215,35 @@ def check_update():
|
||||
break
|
||||
|
||||
# 从数据库读取distributors列表
|
||||
existed_distributors_href = utils.query_distributor_hrefs()
|
||||
existed_distributors_href = db_tools.query_distributor_hrefs()
|
||||
if existed_distributors_href is None:
|
||||
logging.warning(f'get existed distributors from db error.')
|
||||
return
|
||||
new_distributors = fetch_distributors_list(existed_distributors_href)
|
||||
db_tools.update_task_log(task_id, before_distributors=len(existed_distributors_href), new_distributors=len(new_distributors), task_status='Inserting new distributors')
|
||||
for dist in new_distributors:
|
||||
dist_id = utils.insert_or_update_distributor(dist)
|
||||
dist_id = db_tools.insert_or_update_distributor(dist)
|
||||
if dist_id:
|
||||
logging.info(f'insert one distributor record, id: {dist_id}, name: {dist['name']}, href: {dist['href']}')
|
||||
else:
|
||||
logging.warning(f'insert into studio failed. name: {dist['name']} href: {dist['href']}')
|
||||
|
||||
# 从数据库读取studios列表
|
||||
existed_studios_href = utils.query_studio_hrefs()
|
||||
existed_studios_href = db_tools.query_studio_hrefs()
|
||||
if existed_studios_href is None:
|
||||
logging.warning(f'get existed studios from db error.')
|
||||
return
|
||||
new_studios = fetch_studios_list(existed_studios_href)
|
||||
db_tools.update_task_log(task_id, before_studios=len(existed_studios_href), new_studios=len(new_studios), task_status='Inserting new studios')
|
||||
for stu in new_studios:
|
||||
stu_id = utils.insert_or_update_studio(stu)
|
||||
stu_id = db_tools.insert_or_update_studio(stu)
|
||||
if stu_id:
|
||||
logging.info(f'insert one studio record, id: {stu_id}, name: {stu['name']}, href: {stu['href']}')
|
||||
else:
|
||||
logging.warning(f'insert into studio failed. name: {stu['name']}, href: {stu['href']}')
|
||||
|
||||
# 从数据库中读取影片列表
|
||||
existed_movies = utils.query_movie_hrefs()
|
||||
existed_movies = db_tools.query_movie_hrefs()
|
||||
if existed_movies is None:
|
||||
logging.warning(f'load movies from db error')
|
||||
return
|
||||
@ -240,11 +251,12 @@ def check_update():
|
||||
new_movie_hrefs = []
|
||||
|
||||
# 遍历所有 distributors,获取 movies 列表
|
||||
existed_distributors_href = utils.query_distributor_hrefs(name='vixen')
|
||||
existed_distributors_href = db_tools.query_distributor_hrefs(name='vixen')
|
||||
if existed_distributors_href is None:
|
||||
logging.warning(f'get existed distributors from db error.')
|
||||
return
|
||||
for url in existed_distributors_href:
|
||||
logging.info(f"Fetching data for distributor url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
|
||||
@ -265,11 +277,12 @@ def check_update():
|
||||
logging.info(f'all new moives found for distributors, now total new {len(new_movies)}')
|
||||
|
||||
# 遍历所有 studios,获取 movies 列表
|
||||
existed_studios_href = utils.query_studio_hrefs(name='vixen')
|
||||
existed_studios_href = db_tools.query_studio_hrefs(name='vixen')
|
||||
if existed_studios_href is None:
|
||||
logging.warning(f'get existed studios from db error.')
|
||||
return
|
||||
for url in existed_studios_href:
|
||||
logging.info(f"Fetching data for studio url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
|
||||
@ -292,21 +305,28 @@ def check_update():
|
||||
# 对新的影片,逐个获取内容
|
||||
new_movies = list({item["href"]: item for item in new_movies}.values())
|
||||
logging.info(f'get merged new movies, count: {len(new_movies)} ')
|
||||
db_tools.update_task_log(task_id, before_movies=len(existed_movies), new_movies=len(new_movies), task_status='Inserting new movies')
|
||||
for movie in new_movies:
|
||||
url = movie['href']
|
||||
title = movie['title']
|
||||
logging.info(f"Fetching data for movie {title}, url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
|
||||
if soup:
|
||||
movie_data = scraper.parse_page_movie(soup, url, title)
|
||||
if movie_data :
|
||||
movie_id = utils.insert_or_update_movie(movie_data)
|
||||
# 修复url不规范的问题
|
||||
if movie_data['DistributorHref']:
|
||||
movie_data['DistributorHref'] = utils.dist_stu_href_rewrite(movie_data['DistributorHref'].lower())
|
||||
if movie_data['StudioHref']:
|
||||
movie_data['StudioHref'] = utils.dist_stu_href_rewrite(movie_data['StudioHref'].lower())
|
||||
movie_id = db_tools.insert_or_update_movie(movie_data)
|
||||
if movie_id:
|
||||
logging.info(f'insert one movie, id: {movie_id}, title: {title} url: {url}')
|
||||
logging.info(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
|
||||
else:
|
||||
logging.warning(f'insert movie {url} failed.')
|
||||
|
||||
# 写入到本地json文件
|
||||
func.write_movie_json(url, movie_data)
|
||||
utils.write_movie_json(url, movie_data)
|
||||
else:
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
else:
|
||||
@ -315,6 +335,71 @@ def check_update():
|
||||
if debug:
|
||||
break
|
||||
|
||||
# TODO:
|
||||
# 1, appearsIn 因为影片入库的先后顺序不可控,会出现无法插入 movies_appers_in 表的情况,应该要先记录下待处理的movie,所有记录插入完成后再做处理
|
||||
# 2, movie 的更新,涉及到performers的几个统计字段的更新,应该要找到本次tasklog启动后插入到 performers_movies 表里的所有performers,刷新其统计数据;也可以简单粗暴的全量更新
|
||||
# 3, 目前performers_movies以movies爬取的信息为主来更新,perfomers爬取的信息应该可以作为检验,尤其是perfomers页面有notes字段
|
||||
|
||||
logging.info(f'all process completed!')
|
||||
if __name__ == "__main__":
|
||||
db_tools.finalize_task_log(task_id)
|
||||
|
||||
|
||||
# 处理本地数据
|
||||
def load_data():
    """Import locally stored performer and movie JSON data into the database.

    Reads ``../result/detail.json`` (performers) and
    ``../result/movie_details.json`` (movies) through ``utils.read_json`` and
    upserts each record through ``db_tools``. When a file cannot be read
    (``read_json`` returns None) a message is printed and that data set is
    treated as empty, so the other import still runs.
    """
    # Import the performers data that already exists locally.
    performers_file = '../result/detail.json'
    performers_data = utils.read_json(performers_file)
    if performers_data is None:
        print('read file error.')
        performers_data = []

    for person in performers_data:
        performer_id = db_tools.insert_or_update_performer(person)
        # %-style logging args avoid nesting the same quote type inside an
        # f-string, which only parses on Python 3.12+.
        if performer_id:
            logging.info('insert one person, id: %s, person: %s, url: %s',
                         performer_id, person['person'], person['href'])
        else:
            logging.warning('insert person: %s, %s failed.',
                            person['person'], person['href'])

    # Import the movies data that already exists locally.
    movies_file = '../result/movie_details.json'
    movies_data = utils.read_json(movies_file)
    if movies_data is None:
        print('read file error.')
        movies_data = []

    for movie in movies_data:
        # Normalize non-canonical distributor/studio URLs. ``.get`` keeps a
        # record that lacks one of the keys from aborting the whole import.
        for href_key in ('DistributorHref', 'StudioHref'):
            href = movie.get(href_key)
            if href:
                movie[href_key] = utils.dist_stu_href_rewrite(href.lower())

        movie_id = db_tools.insert_or_update_movie(movie)
        if movie_id:
            logging.info('insert one movie, id: %s, title: %s url: %s',
                         movie_id, movie['title'], movie['href'])
        else:
            logging.warning('insert movie %s, %s failed.',
                            movie['title'], movie['href'])

    logging.info('task completed.')
|
||||
|
||||
|
||||
# 主函数
|
||||
def main(task, args_debug):
    """Dispatch the requested task.

    Args:
        task: ``'fetch'`` runs ``check_update()``, ``'load'`` runs
            ``load_data()``; any other value prints a hint and does nothing.
        args_debug: value stored in the module-level ``debug`` flag before
            the task runs.
    """
    global debug
    debug = args_debug
    if debug:
        logging.info('Debug mode enabled.')

    if task == 'fetch':
        check_update()
    elif task == 'load':
        load_data()
    else:
        # Plain literal: the original f-string had no placeholders and
        # misspelled "unknown".
        print('unknown command. see --help.')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 命令行参数处理
|
||||
parser = argparse.ArgumentParser(description='fetch iafd data.')
|
||||
parser.add_argument('--task', type=str, default='fetch', help='fetch from iafd.com or load from local data ... (fetch , load)')
|
||||
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args.task, args.debug)
|
||||
|
||||
@ -137,7 +137,7 @@ def query_performer_hrefs(**filters):
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0] for row in cursor.fetchall()]
|
||||
return [row[0].lower() for row in cursor.fetchall()] # 返回小写
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
@ -217,7 +217,7 @@ def query_distributor_hrefs(**filters):
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0] for row in cursor.fetchall()]
|
||||
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
@ -296,7 +296,7 @@ def query_studio_hrefs(**filters):
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0] for row in cursor.fetchall()]
|
||||
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
@ -334,7 +334,7 @@ def insert_or_update_movie(movie_data):
|
||||
movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
|
||||
)
|
||||
conn.commit()
|
||||
logging.info("Movie inserted/updated: %s", movie_data['title'])
|
||||
logging.debug("Movie inserted/updated: %s", movie_data['title'])
|
||||
|
||||
# 获取插入的 movie_id
|
||||
cursor.execute("SELECT id FROM movies WHERE href = ?", (movie_data['href'],))
|
||||
@ -437,12 +437,62 @@ def query_movie_hrefs(**filters):
|
||||
params.append(f"%{filters['title']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0] for row in cursor.fetchall()]
|
||||
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return []
|
||||
|
||||
# 插入一条任务日志
|
||||
def insert_task_log():
    """Insert a new ``task_log`` row with status ``'Start'``.

    Returns:
        The cursor's ``lastrowid`` after the insert is committed (used by
        callers as the task id), or None when sqlite raises an error, which
        is logged.
    """
    try:
        cursor.execute("""
            INSERT INTO task_log (task_status) VALUES ('Start')
        """)
        conn.commit()
        new_task_id = cursor.lastrowid  # id of the row just inserted
    except sqlite3.Error as e:
        logging.error(f"插入任务失败: {e}")
        return None
    return new_task_id
|
||||
|
||||
# 更新任务日志的字段
|
||||
def update_task_log(task_id, **kwargs):
    """Update columns of one ``task_log`` row and refresh ``updated_at``.

    Args:
        task_id: value matched against ``task_log.task_id``.
        **kwargs: column name -> new value. Values are bound as SQL
            parameters; the column names are interpolated into the statement
            text, so they must be trusted identifiers (callers pass literal
            keyword names). May be empty, in which case only ``updated_at``
            is refreshed.

    sqlite errors are logged and swallowed; nothing is returned.
    """
    try:
        assignments = [f"{key} = ?" for key in kwargs]
        # Appending the timestamp as a list element (instead of after a
        # hard-coded comma) keeps the statement valid when kwargs is empty.
        assignments.append("updated_at = datetime('now', 'localtime')")
        params = [*kwargs.values(), task_id]

        sql = f"UPDATE task_log SET {', '.join(assignments)} WHERE task_id = ?"
        cursor.execute(sql, params)
        conn.commit()
    except sqlite3.Error as e:
        logging.error(f"更新任务 {task_id} 失败: {e}")
|
||||
|
||||
# 任务结束,更新字段
|
||||
def finalize_task_log(task_id):
    """Mark a task as finished and record the final table sizes.

    Counts the rows of ``performers``, ``movies``, ``distributors`` and
    ``studios`` and stores them in the matching ``after_*`` columns of the
    ``task_log`` row via ``update_task_log``, together with
    ``task_status='Success'``. sqlite errors raised here are logged and
    swallowed.
    """
    # task_log column -> query producing its value; order matches the order
    # in which the columns are handed to update_task_log.
    count_queries = {
        "after_performers": "SELECT COUNT(*) FROM performers",
        "after_movies": "SELECT COUNT(*) FROM movies",
        "after_distributors": "SELECT COUNT(*) FROM distributors",
        "after_studios": "SELECT COUNT(*) FROM studios",
    }
    try:
        # Collect the final row count of each table.
        totals = {}
        for column, query in count_queries.items():
            cursor.execute(query)
            totals[column] = cursor.fetchone()[0]

        # Write the totals and the final status to task_log.
        update_task_log(task_id, **totals, task_status="Success")
    except sqlite3.Error as e:
        logging.error(f"任务 {task_id} 结束失败: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
|
||||
@ -24,27 +24,16 @@ update_dir = '../result'
|
||||
performers_dir = f'{update_dir}/performers'
|
||||
movies_dir = f'{update_dir}/movies'
|
||||
|
||||
def uniq_performers(new_performers):
    """Return ``new_performers`` without duplicate ``href`` entries.

    The first item seen for each ``href`` is kept and input order is
    preserved. Any problem (argument is not a list, a falsy item, an item
    whose ``href`` is None or missing) is logged and an empty list is
    returned instead of raising.
    """
    try:
        if not isinstance(new_performers, list):
            raise TypeError(f"new_performers should be a list, but got {type(new_performers)}")

        seen = set()
        unique_performers = []

        for item in new_performers:
            if not item or item['href'] is None:
                raise ValueError(f"Invalid item in new_performers: {item}")

            if item["href"] not in seen:
                seen.add(item["href"])
                unique_performers.append(item)

        return unique_performers

    except Exception as e:
        logging.error(f"Error in remove_duplicate_performers: {e}")
        return []  # return an empty list so the program does not crash


def dist_stu_href_rewrite(href):
    """Rewrite a distributor/studio URL to its canonical, ID-only form.

    Example:
        ``https://www.iafd.com/distrib.rme/distrib=10/1%2ddistribution.htm``
        becomes ``https://www.iafd.com/distrib.rme/distrib=10``.

    Args:
        href: URL expected to contain ``distrib=<digits>`` or
            ``studio=<digits>``.

    Returns:
        The rewritten URL, or None when ``href`` is empty, not a string, or
        contains neither pattern.
    """
    # Local import: the file-level import block is not visible from here.
    import re

    # Guard against None / non-string input, which re.search would reject
    # with a TypeError.
    if not href or not isinstance(href, str):
        return None

    # Extract the ID (works for both distrib and studio URLs).
    match = re.search(r"(distrib|studio)=(\d+)", href)
    if not match:
        return None  # not a target URL

    key, id_number = match.groups()
    return f"https://www.iafd.com/{key}.rme/{key}={id_number}"
|
||||
|
||||
# 创建目录
|
||||
def create_sub_directory(base_dir, str):
|
||||
@ -90,3 +79,15 @@ def write_movie_json(href, data):
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing file {full_path}: {e}")
|
||||
|
||||
|
||||
# 读取json文件并返回内容
|
||||
def read_json(file_path):
    """Read a UTF-8 JSON file and return its parsed content.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        The decoded JSON value, or None when the file is missing, cannot be
        read, is not valid UTF-8, or does not contain valid JSON. A message
        is printed in each failure case.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"文件 {file_path} 未找到.")
        return None
    except json.JSONDecodeError:
        print(f"文件 {file_path} 解析错误.")
        return None
    except (OSError, UnicodeDecodeError) as e:
        # Other read failures (permission denied, path is a directory,
        # invalid UTF-8) also honour the "None on failure" contract instead
        # of crashing the caller.
        print(f"文件 {file_path} 读取失败: {e}")
        return None
|
||||
|
||||
Reference in New Issue
Block a user