modify some scripts.
@@ -20,26 +20,29 @@ def fetch_actor_list():
     next_url = scraper.actors_uncensored_base_url
     while next_url:
         logging.info(f'fetching page {next_url}')
-        soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
+        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
         if soup:
            list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
            if list_data:
                # Write to the database
                for row in list_data:
-                    actor_id = db_tools.insert_actor_index(name=row['name'], href=row['href'] if row['href'] else '')
+                    actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
                    if actor_id:
                        logging.debug(f'insert performer index to db. performer_id: {actor_id}, name: {row["name"]}, href: {row["href"]}')
                    else:
                        logging.warning(f'insert performer index failed. name: {row["name"]}, href: {row["href"]}')
            else:
                logging.warning(f'fetch actor error. {next_url} ...')
+        elif status_code and status_code == 404:
+            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
+            break
 
 # Fetch the makers list
 def fetch_makers_list():
     next_url = scraper.makers_uncensored_base_url
     while next_url:
         logging.info(f'fetching page {next_url}')
-        soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
+        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
         if soup:
            list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
            if list_data:
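
The change above switches fetch_page from returning a bare soup to returning a (soup, status_code) pair, so pagination loops can tell a permanent 404 apart from a transient failure and stop instead of retrying. A minimal standalone sketch of the new calling convention; the stub fetch_page and the URL are placeholders, not the real scraper module:

def fetch_page(url):
    # Stand-in for scraper.fetch_page: pretend the page was deleted upstream.
    return None, 404

next_url = "https://example.com/actors?page=1"  # placeholder URL
while next_url:
    soup, status_code = fetch_page(next_url)
    if soup:
        next_url = None  # the real code parses the page and advances next_url
    elif status_code == 404:
        break  # a 404 is permanent: stop paging rather than retry forever
    else:
        next_url = None  # transient failure, already retried inside fetch_page
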
@@ -53,12 +56,16 @@ def fetch_makers_list():
            else:
                logging.warning(f'fetch maker error. {next_url} ...')
 
+        elif status_code and status_code == 404:
+            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
+            break
 
 # Fetch the series list
 def fetch_series_list():
     next_url = scraper.series_uncensored_base_url
     while next_url:
         logging.info(f'fetching page {next_url}')
-        soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
+        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
         if soup:
            list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
            if list_data:
@@ -72,6 +79,10 @@ def fetch_series_list():
            else:
                logging.warning(f'fetch series error. {next_url} ...')
 
+        elif status_code and status_code == 404:
+            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
+            break
 
 
 # Update the movie info for makers in the list
 def fetch_movies_by_maker():
@@ -79,21 +90,27 @@ def fetch_movies_by_maker():
     if debug:
         url_list = db_tools.query_maker_hrefs(name='muramura')
     for url in url_list:
-        next_url = url
-        while True:
+        # Strip the download flag from the URL (if present)
+        next_url = utils.remove_url_query(url)
+        while next_url:
            logging.info(f"Fetching data for maker url {next_url} ...")
-            soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
+            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_maker_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
-                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'])
+                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1)
                        if tmp_id:
                            logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                        else:
                            logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
                else:
                    logging.warning(f'parse_page_movie error. url: {next_url}')
 
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
+                break
 
     # Break early when debugging
     if debug:
         return True
@@ -104,21 +121,26 @@ def fetch_movies_by_series():
     if debug:
         url_list = db_tools.query_series_hrefs(name='10musume')
     for url in url_list:
-        next_url = url
-        while True:
+        # Strip the download flag from the URL (if present)
+        next_url = utils.remove_url_query(url)
+        while next_url:
            logging.info(f"Fetching data for series url {next_url} ...")
-            soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
+            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_series_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
-                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'])
+                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1)
                        if tmp_id:
                            logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
                        else:
                            logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
                else:
                    logging.warning(f'parse_page_movie error. url: {next_url}')
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
+                break
 
     # Break early when debugging
     if debug:
         return True
@@ -129,23 +151,31 @@ def fetch_performers_detail():
     perfomers_list = []
     while True:
         # Fetch a batch at a time rather than everything at once
-        perfomers_list = db_tools.query_actors(is_full_data=0, limit=10)
+        perfomers_list = db_tools.query_actors(is_full_data=0, limit=100)
         if len(perfomers_list) < 1:
            logging.info(f'all performers fetched.')
            break
         for performer in perfomers_list:
            url = performer['href']
            person = performer['name']
+            pic = ''
+            alias = []
 
            next_url = url
            all_movies = []
            while next_url:
                logging.info(f"Fetching data for actor ({person}), url {next_url} ...")
-                soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
+                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
                if soup:
                    data, next_url = scraper.parse_actor_detail(soup, next_url)
                    if data:
-                        all_movies.extend(data)
+                        pic = data.get('pic', '')
+                        alias = data.get('alias', [])
+                        all_movies.extend(data.get('movies', []))
 
+                elif status_code and status_code == 404:
+                    logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skipping...')
+                    break
                else:
                    logging.warning(f'fetch_page error. person: ({person}), url: {url}')
 
@@ -153,8 +183,8 @@ def fetch_performers_detail():
            performer_id = db_tools.insert_or_update_actor({
                'href': url,
                'name': person,
-                'pic': '',
-                'alias': [],
+                'pic': pic,
+                'alias': alias,
                'credits': all_movies
            })
            if performer_id:
@@ -169,7 +199,7 @@ def fetch_performers_detail():
 def fetch_movies_detail():
     movies_list = []
     while True:
-        movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=10)
+        movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=100)
         if len(movies_list) < 1:
            logging.info(f'all movies fetched.')
            break
@@ -177,7 +207,7 @@ def fetch_movies_detail():
            url = movie['href']
            title = movie['title']
            logging.info(f"Fetching data for movie ({title}), url {url} ...")
-            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
            if soup:
                movie_data = scraper.parse_movie_detail(soup, url, title)
                if movie_data:
@@ -188,84 +218,76 @@ def fetch_movies_detail():
                        logging.warning(f'insert movie {url} failed.')
                else:
                    logging.warning(f'parse_page_movie error. url: {url}')
 
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}')
+                break
            else:
                logging.warning(f'fetch_page error. url: {url}')
            # Break early when debugging
            if debug:
                return True
 
-# Check for updates
-def check_update():
-
+# Map each shortcut name to its function
+function_map = {
+    "actor_list": fetch_actor_list,
+    "maker_list": fetch_makers_list,
+    "series_list": fetch_series_list,
+    "makers": fetch_movies_by_maker,
+    "series": fetch_movies_by_series,
+    "movies": fetch_movies_detail,
+    "actors": fetch_performers_detail,
+}
+
+# Main entry point
+def main(cmd, args_debug, args_force):
+    global debug
+    debug = args_debug
+
+    global force
+    force = args_force
+
     # Start the task
     task_id = db_tools.insert_task_log()
     if task_id is None:
         logging.warning(f'insert task log error.')
         return None
 
+    logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, cmd: {cmd}')
 
-    if False:
-        # Refresh the actor list
-        db_tools.update_task_log(task_id, task_status='fetching actor list')
-        fetch_actor_list()
-
-        # Refresh the makers list
-        db_tools.update_task_log(task_id, task_status='fetching maker list')
-        fetch_makers_list()
-
-        # Refresh the series list
-        db_tools.update_task_log(task_id, task_status='fetching series list')
-        fetch_series_list()
-
-        # Refresh the movie lists
-        db_tools.update_task_log(task_id, task_status='fetching movie list by maker')
-        fetch_movies_by_maker()
-        db_tools.update_task_log(task_id, task_status='fetching movie list by series')
-        fetch_movies_by_series()
-
-        # Update actor details
-        db_tools.update_task_log(task_id, task_status='fetching performers')
-        fetch_performers_detail()
-
-        # Update movie details
-        db_tools.update_task_log(task_id, task_status='fetching movies')
-        fetch_movies_detail()
+    # Run the requested functions
+    if cmd:
+        function_names = cmd.split(",")  # split the input
+        for short_name in function_names:
+            func = function_map.get(short_name.strip())  # look up the function for this shortcut
+            if callable(func):
+                db_tools.update_task_log(task_id, task_status=f'Running {func}')
+                func()
+            else:
+                logging.warning(f"{short_name} is not a valid function shortcut.")
+    else:  # no --cmd given: run everything
+        for name, func in function_map.items():
+            if callable(func):
+                db_tools.update_task_log(task_id, task_status=f'Running {func}')
+                func()
+            else:
+                logging.warning(f"{name} is not a valid function shortcut.")
 
     logging.info(f'all process completed!')
     db_tools.finalize_task_log(task_id)
 
-# TODO:
-# 1,
-
-# Process local data
-def load_data():
-    return True
-
-# Main entry point
-def main(task, args_debug, args_force):
-    global debug
-    debug = args_debug
-    if debug:
-        logging.info('Debug mode enabled.')
-
-    global force
-    force = args_force
-    if force:
-        logging.info('force update for all data.')
-
-    if task == 'fetch':
-        check_update()
-    elif task == 'load':
-        load_data()
-    else:
-        print(f'unknown command. see --help.')
 
 
 if __name__ == "__main__":
     # Command-line argument handling
-    parser = argparse.ArgumentParser(description='fetch iafd data.')
-    parser.add_argument('--task', type=str, default='fetch', help='fetch from iafd.com or load from local data ... (fetch , load)')
+    keys_str = ",".join(function_map.keys())
+
+    parser = argparse.ArgumentParser(description='fetch javdb data.')
+    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
     parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
     parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
     args = parser.parse_args()
 
-    main(args.task, args.debug, args.force)
+    main(args.cmd, args.debug, args.force)
 
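
The new main() drives everything through function_map: --cmd takes a comma-separated list of shortcuts, each mapped to a fetch function, and with no --cmd every entry in the map runs in order. A runnable sketch of that dispatch pattern; the two stub entries are placeholders for the real fetch functions:

function_map = {
    "actor_list": lambda: print("fetching actor list"),  # stub
    "maker_list": lambda: print("fetching maker list"),  # stub
}

def dispatch(cmd):
    # Comma-separated shortcuts, or every entry when cmd is empty.
    names = cmd.split(",") if cmd else list(function_map)
    for name in names:
        func = function_map.get(name.strip())
        if callable(func):
            func()
        else:
            print(f"{name} is not a valid function shortcut.")

dispatch("actor_list, maker_list")
dispatch("bogus")  # -> bogus is not a valid function shortcut.
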
@@ -30,9 +30,15 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
        try:
            if 'javdb.com' not in url.lower():
                logging.error(f'wrong url format: {url}')
-                return None
+                return None, None
 
            response = scraper.get(url, headers=headers)
 
+            # Handle the HTTP status code
+            if response.status_code == 404:
+                logging.warning(f"Page not found (404): {url}")
+                return None, 404  # Return the 404 directly so the caller can skip this page
+
            response.raise_for_status()  # Raise on HTTP errors
 
            # Preprocess the HTML (if a preprocessor was provided)
@@ -40,7 +46,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
 
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the custom page check
-                return soup
+                return soup, response.status_code
 
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
@@ -51,7 +57,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")
 
    logging.error(f'Fetching failed after max retries. {url}')
-    return None  # Still failing after the maximum number of retries
+    return None, None  # Still failing after the maximum number of retries
 
 # Repair the HTML structure: strip extra tags and fix <a> tags (needed when extracting ethnicity)
 def preprocess_html(html):
@@ -78,6 +84,21 @@ def url_page_num(href):
    else:
        return None
 
 
+# <span class="avatar" style="background-image: url(https://c0.jdbstatic.com/avatars/md/mdRn.jpg)"></span>
+def parse_avatar_image(soup):
+    try:
+        span = soup.find("span", class_="avatar")
+        if not span:
+            return ""  # No <span> element found; return an empty string
+
+        style = span.get("style", "")
+        match = re.search(r'url\(["\']?(.*?)["\']?\)', style)
+        return match.group(1) if match else ""  # Return the URL on success, otherwise an empty string
+    except Exception as e:
+        return ""  # Return an empty string on any exception
+
+
 # Parse the HTML and extract the data we need
 def parse_actors_uncensored(soup, href):
    div_actors = soup.find("div", id='actors')
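
The regex in parse_avatar_image pulls the image URL out of a CSS background-image declaration, tolerating optional quotes around the URL. The extraction step in isolation, using the sample markup from the comment above:

import re

style = 'background-image: url(https://c0.jdbstatic.com/avatars/md/mdRn.jpg)'
match = re.search(r'url\(["\']?(.*?)["\']?\)', style)
print(match.group(1) if match else "")
# -> https://c0.jdbstatic.com/avatars/md/mdRn.jpg
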
@@ -123,6 +144,29 @@ def parse_actors_uncensored(soup, href):
 
 # Parse the HTML and extract the data we need
 def parse_actor_detail(soup, href):
+    # Look for aliases first
+    alias_list = []
+
+    div_meta = soup.find('span', class_='actor-section-name')
+    if not div_meta:
+        logging.warning(f'warning: no meta data found in page {href}')
+        return None, None
+    alias_div = soup.find('div', class_='column section-title')
+
+    if alias_div:
+        meta_list = alias_div.find_all('span', class_='section-meta')
+        if len(meta_list) > 1:
+            alias_list = meta_list[0].text.strip().split(", ")
+
+    # Avatar
+    pic = ''
+    avatar = soup.find("div", class_="column actor-avatar")
+    if avatar:
+        pic = parse_avatar_image(avatar)
+
+    # Data to return
+    actor = {}
+
    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
    if not div_movies:
        logging.warning(f"Warning: No movies div found ")
@@ -157,7 +201,13 @@ def parse_actor_detail(soup, href):
    if next_page_number and next_page_number > current_page_number:
        next_url = host_url + next_page_url
 
-    return list_data, next_url
+    actor = {
+        'pic': pic,
+        'alias': alias_list,
+        'movies': list_data
+    }
+
+    return actor, next_url
 
 
 # Parse the HTML and extract the data we need
@@ -257,7 +307,7 @@ def parse_series_detail(soup, href):
    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
    if not div_movies:
        logging.warning(f"Warning: No movies div found ")
-        return None, None
+        return [], None
 
    # Parse the elements
    rows = div_movies.find_all('div', class_='item')
@@ -337,7 +387,7 @@ def parse_maker_detail(soup, href):
    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
    if not div_movies:
        logging.warning(f"Warning: No movies div found ")
-        return None, None
+        return [], None
 
    # Parse the elements
    rows = div_movies.find_all('div', class_='item')
 
@@ -17,21 +17,42 @@ def get_id_by_href(table: str, href: str) -> int:
    row = cursor.fetchone()
    return row[0] if row else None
 
 # Insert an actor index entry (from the list data)
-def insert_actor_index(name, href):
-
+def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None):
    try:
-        cursor.execute("""
-            INSERT OR IGNORE INTO javdb_actors (href, name) VALUES (?, ?)
-        """, (
-            href, name
-        ))
+        # Check whether this actor already exists
+        cursor.execute("SELECT id, name, from_actor_list, from_movie_list FROM javdb_actors WHERE href = ?", (href,))
+        existing_actor = cursor.fetchone()
+
+        if existing_actor:  # The actor already exists
+            actor_id, existing_name, existing_actor_list, existing_movie_list = existing_actor
+
+            # Keep the stored values when no new ones were passed in
+            from_actor_list = from_actor_list if from_actor_list is not None else existing_actor_list
+            from_movie_list = from_movie_list if from_movie_list is not None else existing_movie_list
+
+            cursor.execute("""
+                UPDATE javdb_actors
+                SET name = ?,
+                    from_actor_list = ?,
+                    from_movie_list = ?,
+                    updated_at = datetime('now', 'localtime')
+                WHERE href = ?
+            """, (name, from_actor_list, from_movie_list, href))
+        else:  # The actor does not exist yet; insert it
+            cursor.execute("""
+                INSERT INTO javdb_actors (href, name, from_actor_list, from_movie_list)
+                VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0))
+            """, (href, name, from_actor_list, from_movie_list))
 
        conn.commit()
 
        performer_id = get_id_by_href('javdb_actors', href)
        if performer_id:
-            logging.debug(f'insert one actor index, id: {performer_id}, name: {name}, href: {href}')
+            logging.debug(f'Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}')
 
-        return performer_id
+        return performer_id
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Database error: {e}")
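
The rewritten insert_actor_index is an upsert that treats None as "leave the stored flag alone": an existing row keeps its from_* values unless the caller passes new ones, while COALESCE(?, 0) defaults them to 0 on first insert. A self-contained sketch of the same pattern against a simplified in-memory table (schema and names abbreviated from the real javdb_actors):

import sqlite3

conn = sqlite3.connect(":memory:")
cursor = conn.cursor()
cursor.execute("CREATE TABLE actors (href TEXT PRIMARY KEY, name TEXT, from_actor_list INTEGER, from_movie_list INTEGER)")

def upsert_actor(name, href, from_actor_list=None, from_movie_list=None):
    cursor.execute("SELECT from_actor_list, from_movie_list FROM actors WHERE href = ?", (href,))
    row = cursor.fetchone()
    if row:
        # Keep the stored flags when the caller did not pass new values.
        from_actor_list = from_actor_list if from_actor_list is not None else row[0]
        from_movie_list = from_movie_list if from_movie_list is not None else row[1]
        cursor.execute("UPDATE actors SET name = ?, from_actor_list = ?, from_movie_list = ? WHERE href = ?",
                       (name, from_actor_list, from_movie_list, href))
    else:
        # New rows default unspecified flags to 0 via COALESCE.
        cursor.execute("INSERT INTO actors VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0))",
                       (href, name, from_actor_list, from_movie_list))
    conn.commit()

upsert_actor("Alice", "/actors/a1", from_actor_list=1)
upsert_actor("Alice", "/actors/a1", from_movie_list=1)  # from_actor_list stays 1
print(cursor.execute("SELECT * FROM actors").fetchone())
# -> ('/actors/a1', 'Alice', 1, 1)
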
@@ -41,28 +62,49 @@ def insert_actor_index(name, href):
        logging.error(f"Unknown error: {e}")
        return None
 
 
 # Insert a movie index entry (from the list data)
-def insert_movie_index(title, href):
+def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None):
    try:
-        # Insert or update the movie info
-        cursor.execute("""
-            INSERT OR IGNORE INTO javdb_movies (title, href) VALUES (?, ?)
-        """,
-            (title, href)
-        )
+        # First check whether this movie is already in the database
+        cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series FROM javdb_movies WHERE href = ?", (href,))
+        existing_movie = cursor.fetchone()
+
+        if existing_movie:  # The movie already exists
+            movie_id, existing_actor, existing_maker, existing_series = existing_movie
+
+            # Keep the stored values when no new ones were passed in
+            from_actor_list = from_actor_list if from_actor_list is not None else existing_actor
+            from_movie_makers = from_movie_makers if from_movie_makers is not None else existing_maker
+            from_movie_series = from_movie_series if from_movie_series is not None else existing_series
+
+            cursor.execute("""
+                UPDATE javdb_movies
+                SET title = ?,
+                    from_actor_list = ?,
+                    from_movie_makers = ?,
+                    from_movie_series = ?,
+                    updated_at = datetime('now', 'localtime')
+                WHERE href = ?
+            """, (title, from_actor_list, from_movie_makers, from_movie_series, href))
+        else:  # The movie does not exist yet; insert it
+            cursor.execute("""
+                INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series)
+                VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
+            """, (title, href, from_actor_list, from_movie_makers, from_movie_series))
 
        conn.commit()
 
        movie_id = get_id_by_href('javdb_movies', href)
        if movie_id:
-            logging.debug(f'insert one movie index, id: {movie_id}, title: {title}, href: {href}')
+            logging.debug(f'Inserted/Updated movie index, id: {movie_id}, title: {title}, href: {href}')
 
-        return movie_id
+        return movie_id
    except Exception as e:
        conn.rollback()
-        logging.error("Error inserting movie: %s", e)
+        logging.error(f"Error inserting/updating movie: {e}")
        return None
 
 
 
 # Insert actor-movie relation data
 def insert_actor_movie(performer_id, movie_id, tags=''):
    try:
@@ -117,7 +159,7 @@ def insert_or_update_actor(actor):
            movie_id = get_id_by_href('javdb_movies', movie['href'])
            # The movie does not exist yet; insert it first
            if movie_id is None:
-                movie_id = insert_movie_index(movie['title'], movie['href'])
+                movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1)
            if movie_id:
                tmp_id = insert_actor_movie(actor_id, movie_id)
                if tmp_id:
@@ -369,7 +411,7 @@ def insert_or_update_movie(movie):
            performer_id = get_id_by_href('javdb_actors', performer['href'])
            # The actor does not exist yet; insert it first
            if performer_id is None:
-                performer_id = insert_actor_index(performer['name'], performer['href'])
+                performer_id = insert_actor_index(performer['name'], performer['href'], from_movie_list=1)
            if performer_id:
                tmp_id = insert_actor_movie(performer_id, movie_id)
                if tmp_id:
@@ -465,7 +507,13 @@ def insert_task_log():
            INSERT INTO javdb_task_log (task_status) VALUES ('Start')
        """)
        conn.commit()
-        return cursor.lastrowid  # The task_id of the row just inserted
+
+        task_id = cursor.lastrowid
+        if task_id is None:
+            return None
+        update_task_log(task_id=task_id, task_status='Start')
+
+        return task_id  # The task_id of the row just inserted
    except sqlite3.Error as e:
        logging.error(f"Failed to insert the task log: {e}")
        return None
 
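
insert_task_log now captures cursor.lastrowid (the rowid sqlite3 assigns to the row just inserted) and feeds it to update_task_log, instead of returning it blindly. The mechanism in isolation, against a throwaway in-memory table:

import sqlite3

conn = sqlite3.connect(":memory:")
cursor = conn.cursor()
cursor.execute("CREATE TABLE task_log (id INTEGER PRIMARY KEY, task_status TEXT)")
cursor.execute("INSERT INTO task_log (task_status) VALUES ('Start')")
conn.commit()

task_id = cursor.lastrowid  # id of the row just inserted
print(task_id)  # -> 1
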
@@ -0,0 +1,18 @@
+import re
+import os
+import json
+import time
+import csv
+from urllib.parse import urlparse
+import logging
+
+
+# Strip the trailing query parameters from URLs like https://www.javdb.com/makers/16w?f=download
+def remove_url_query(url: str) -> str:
+    try:
+        parsed_url = urlparse(url)
+        clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
+        return clean_url
+    except Exception as e:
+        print(f"Failed to parse URL: {e}")
+        return url
 
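
The new helper in use: the query string (here the ?f=download flag) is dropped while scheme, host, and path survive:

from urllib.parse import urlparse

def remove_url_query(url: str) -> str:
    parsed_url = urlparse(url)
    return f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"

print(remove_url_query("https://www.javdb.com/makers/16w?f=download"))
# -> https://www.javdb.com/makers/16w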