modify scripts

This commit is contained in:
oscarz
2025-04-25 16:23:01 +08:00
parent 4c80e72a98
commit 40b5d3f99a
3 changed files with 198 additions and 11 deletions

View File

@ -161,6 +161,42 @@ def fetch_movies_by_series():
if debug: if debug:
return True return True
# Fetch the movie listings of every publisher and store them in the movie index
def fetch_movies_by_publishers():
    """Crawl every publisher listing page and index the movies found on it.

    The publisher URLs come from ``db_tools.query_publishers_hrefs``: a single
    row in debug mode, only rows with ``from_list=1`` in fast mode, otherwise
    every row. Each publisher is followed page by page through the
    ``next_url`` returned by ``scraper.parse_publisher_detail`` and every
    movie is stored with ``db_tools.insert_movie_index``.

    Returns:
        True when running in debug mode (after the first publisher),
        otherwise None.
    """
    # Debug mode always wins, so avoid running a query whose result is discarded.
    if debug:
        url_list = db_tools.query_publishers_hrefs(limit=1)
    elif fast_mode:
        url_list = db_tools.query_publishers_hrefs(from_list=1)
    else:
        url_list = db_tools.query_publishers_hrefs()

    # query_publishers_hrefs returns None on a database error; treat that as
    # "nothing to do" instead of failing with a TypeError.
    for row in url_list or []:
        row_id = row['id']
        # Strip the query string (e.g. a "downloadable" flag) from the URL, if any.
        next_url = utils.remove_url_query(row['href'])
        while next_url:
            # Remember the page being fetched: next_url is overwritten by the parser.
            page_url = next_url
            logging.info(f"Fetching data for publisher url {page_url} ...")
            soup, status_code = scraper.fetch_page(
                page_url,
                partial(scraper.generic_validator, tag="div", identifier="modal-card", attr_type="class"),
            )
            if soup:
                list_data, next_url = scraper.parse_publisher_detail(soup, page_url)
                if not list_data:
                    logging.warning(f'parse_page_movie error. url: {page_url}')
                    continue
                for movie in list_data:
                    tmp_id = db_tools.insert_movie_index(
                        title=movie['title'],
                        href=movie['href'],
                        from_movie_publishers=1,
                        pub_id=row_id,
                    )
                    # Double-quoted f-strings keep these lines valid before Python 3.12.
                    if tmp_id:
                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                    else:
                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
            elif status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {page_url}')
                break
            # NOTE(review): any other fetch failure leaves next_url unchanged, so
            # the same page is retried without a limit -- confirm this is intended.

        # Debug mode: stop after the first publisher.
        if debug:
            return True
# 更新演员信息 # 更新演员信息
def fetch_performers_detail(): def fetch_performers_detail():
@ -316,6 +352,7 @@ function_map = {
"series_list": fetch_series_list, "series_list": fetch_series_list,
"makers": fetch_movies_by_maker, "makers": fetch_movies_by_maker,
"series" : fetch_movies_by_series, "series" : fetch_movies_by_series,
"pub" : fetch_movies_by_publishers,
"movies" : fetch_movies_detail, "movies" : fetch_movies_detail,
"actors" : fetch_performers_detail, "actors" : fetch_performers_detail,
} }

View File

@ -355,6 +355,7 @@ def parse_movie_detail(soup, href, title):
# 获取maker系列 # 获取maker系列
result['maker_name'], result['maker_link'] = parse_movie_val_href(soup, ['片商:', 'Maker:']) result['maker_name'], result['maker_link'] = parse_movie_val_href(soup, ['片商:', 'Maker:'])
result['series_name'], result['series_link'] = parse_movie_val_href(soup, ['系列:', 'Series:']) result['series_name'], result['series_link'] = parse_movie_val_href(soup, ['系列:', 'Series:'])
result['pub_name'], result['pub_link'] = parse_movie_val_href(soup, ['發行:', 'Publisher:'])
# 获取演员tags # 获取演员tags
result['tags'] = parse_movie_arr(soup, ['類別:', 'Tags:']) result['tags'] = parse_movie_arr(soup, ['類別:', 'Tags:'])
@ -523,10 +524,49 @@ def parse_maker_detail(soup, href):
return list_data, next_url return list_data, next_url
# 解析 HTML 内容,提取需要的数据
def parse_publisher_detail(soup, href):
    """Extract the movie entries and the next-page URL from a publisher page.

    Args:
        soup: Parsed page; it is searched with ``find`` / ``find_all``
            (presumably a BeautifulSoup object -- confirm against the caller).
        href: URL of the page being parsed; only used to read the current
            page number and for log messages.

    Returns:
        A ``(list_data, next_url)`` tuple. ``list_data`` is a list of dicts
        with the keys ``href``, ``serial_number``, ``title`` and
        ``release_date``. ``next_url`` is None unless a ``pagination-next``
        link points to a page number greater than the current one.
        ``([], None)`` is returned when the movie list container is missing.
    """
    div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if not div_movies:
        logging.warning("Warning: No movies div found ")
        return [], None

    def text_of(parent, *args, **kwargs):
        # Stripped text of the first matching child, or '' when it is absent.
        node = parent.find(*args, **kwargs)
        return node.text.strip() if node else ''

    list_data = []
    for row in div_movies.find_all('div', class_='item'):
        anchor = row.find('a', class_='box')
        link = anchor.get('href') if anchor else None
        if not link:
            # One malformed item must not abort the whole page; without a
            # link there is no usable key for the movie, so skip it.
            logging.warning(f"skip movie item without link. url: {href}")
            continue
        list_data.append({
            'href': host_url + link,
            'serial_number': text_of(row, 'strong'),
            'title': text_of(row, 'div', class_='video-title'),
            'release_date': text_of(row, 'div', class_='meta'),
        })

    # Look for the "next page" button.
    next_url = None
    next_page_element = soup.find('a', class_='pagination-next')
    next_page_url = next_page_element.get('href') if next_page_element else None
    if next_page_url:
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        # Only move forward; this guards against a "next" link that loops back.
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
# 解析 HTML 内容,提取需要的数据 # 解析 HTML 内容,提取需要的数据
def parse_uncensored(soup, href): def parse_uncensored(soup, href):
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8') #div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8')
div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
if not div_movies: if not div_movies:
logging.warning(f"Warning: No movies div found ") logging.warning(f"Warning: No movies div found ")
return [], None return [], None

View File

@ -64,14 +64,14 @@ def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None):
logging.error(f"未知错误: {e}") logging.error(f"未知错误: {e}")
return None return None
def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None, maker_id=None, series_id=None): def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None, maker_id=None, series_id=None, from_movie_publishers=None, pub_id=None):
try: try:
# **先检查数据库中是否已有该电影** # **先检查数据库中是否已有该电影**
cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id FROM javdb_movies WHERE href = ?", (href,)) cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id FROM javdb_movies WHERE href = ?", (href,))
existing_movie = cursor.fetchone() existing_movie = cursor.fetchone()
if existing_movie: # **如果电影已存在** if existing_movie: # **如果电影已存在**
movie_id, existing_actor, existing_maker, existing_series, existing_maker_id, existing_series_id = existing_movie movie_id, existing_actor, existing_maker, existing_series, existing_maker_id, existing_series_id, existing_pub, existing_pub_id = existing_movie
# **如果没有传入值,就用原来的值** # **如果没有传入值,就用原来的值**
from_actor_list = from_actor_list if from_actor_list is not None else existing_actor from_actor_list = from_actor_list if from_actor_list is not None else existing_actor
@ -79,6 +79,8 @@ def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None
from_movie_series = from_movie_series if from_movie_series is not None else existing_series from_movie_series = from_movie_series if from_movie_series is not None else existing_series
maker_id = maker_id if maker_id is not None else existing_maker_id maker_id = maker_id if maker_id is not None else existing_maker_id
series_id = series_id if series_id is not None else existing_series_id series_id = series_id if series_id is not None else existing_series_id
from_movie_publishers = from_movie_publishers if from_movie_publishers is not None else existing_pub
pub_id = pub_id if pub_id is not None else existing_pub_id
cursor.execute(""" cursor.execute("""
UPDATE javdb_movies UPDATE javdb_movies
@ -88,14 +90,16 @@ def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None
from_movie_series = ?, from_movie_series = ?,
maker_id = ?, maker_id = ?,
series_id = ?, series_id = ?,
from_movie_publishers = ?,
pub_id = ?,
updated_at = datetime('now', 'localtime') updated_at = datetime('now', 'localtime')
WHERE href = ? WHERE href = ?
""", (title, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, href)) """, (title, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id, href))
else: # **如果电影不存在,插入** else: # **如果电影不存在,插入**
cursor.execute(""" cursor.execute("""
INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id) INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id)
VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0)) VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
""", (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id)) """, (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id))
conn.commit() conn.commit()
@ -165,6 +169,7 @@ def insert_or_update_actor(actor):
movie_id = get_id_by_href('javdb_movies', movie['href']) movie_id = get_id_by_href('javdb_movies', movie['href'])
# 影片不存在,先插入 # 影片不存在,先插入
if movie_id is None: if movie_id is None:
# TODO: from_actor_list 只标记无码女优的话,这里要修改,暂时不动
movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1) movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1)
if movie_id: if movie_id:
tmp_id = insert_actor_movie(actor_id, movie_id) tmp_id = insert_actor_movie(actor_id, movie_id)
@ -470,6 +475,107 @@ def query_series_hrefs(**filters):
logging.error(f"查询 href 失败: {e}") logging.error(f"查询 href 失败: {e}")
return None return None
# Insert or update a publisher
def insert_or_update_publishers(data, caller='list'):
try:
if caller == 'list':
cursor.execute("""
INSERT INTO javdb_publishers (name, href, from_list, updated_at)
VALUES (?, ? , 1, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
from_list = 1,
updated_at = datetime('now', 'localtime')
""", (data["name"], data["href"]))
conn.commit()
elif caller == 'movie':
cursor.execute("""
INSERT INTO javdb_publishers (name, href, from_movie_list, updated_at)
VALUES (?, ? , 1, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
from_movie_list = 1,
updated_at = datetime('now', 'localtime')
""", (data["name"], data["href"]))
conn.commit()
else:
logging.warning(f"unexpected caller: {caller}")
return None
# 获取 performer_id
cursor.execute("SELECT id FROM javdb_publishers WHERE href = ?", (data["href"],))
dist_id = cursor.fetchone()[0]
if dist_id:
logging.debug(f"成功插入/更新发行商: {data['name']}")
return dist_id
else:
return None
except sqlite3.Error as e:
conn.rollback()
logging.error(f"数据库错误: {e}")
return None
# Delete a publisher (by id or name)
def delete_publishers(identifier):
    """Delete publisher rows by id (int) or by exact name (str).

    Args:
        identifier: An int is matched against ``id``; a str is matched
            against ``name``. Any other type is rejected with a warning and
            nothing is executed or committed.

    Returns:
        None. The outcome is only reported through the log; database errors
        are rolled back and logged.
    """
    if isinstance(identifier, int):
        sql = "DELETE FROM javdb_publishers WHERE id = ?"
    elif isinstance(identifier, str):
        sql = "DELETE FROM javdb_publishers WHERE name = ?"
    else:
        # Previously this case ran no DELETE yet still logged a success.
        logging.warning(f"unexpected identifier type: {type(identifier).__name__}")
        return

    try:
        cursor.execute(sql, (identifier,))
        deleted = cursor.rowcount
        conn.commit()
        # Only report success when a row was actually removed.
        if deleted:
            logging.info(f"成功删除发行商: {identifier}")
        else:
            logging.warning(f"未找到发行商: {identifier}")
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"删除失败: {e}")
# Query a publisher (by id or name)
def query_publishers(identifier):
    """Fetch the first publisher row matching an id or a name fragment.

    Args:
        identifier: An int is matched against ``id``; any other value is
            formatted into a ``LIKE '%...%'`` pattern on ``name``.

    Returns:
        A dict mapping column names to values for the first matching row, or
        None when nothing matches or the query fails.
    """
    if isinstance(identifier, int):
        statement = "SELECT * FROM javdb_publishers WHERE id = ?"
        arguments = (identifier,)
    else:
        statement = "SELECT * FROM javdb_publishers WHERE name LIKE ?"
        arguments = (f"%{identifier}%",)

    try:
        cursor.execute(statement, arguments)
        record = cursor.fetchone()
        if not record:
            logging.warning(f"未找到发行商: {identifier}")
            return None
        # Pair every column name reported by the cursor with its value.
        column_names = [column[0] for column in cursor.description]
        return dict(zip(column_names, record))
    except sqlite3.Error as e:
        logging.error(f"查询失败: {e}")
        return None
# 按条件查询 href 列表
def query_publishers_hrefs(**filters):
    """Return the href and id of every publisher matching the given filters.

    Supported keyword filters (all optional, combined with AND):
        id: exact match on ``id``.
        from_list: exact match on ``from_list``.
        url: exact match on ``href``.
        name: substring match on ``name`` (``LIKE '%name%'``).
        limit: maximum number of rows returned.

    Returns:
        A list of ``{'href': ..., 'id': ...}`` dicts, or None when the query
        fails with a database error.
    """
    try:
        sql = "SELECT href, id FROM javdb_publishers WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "from_list" in filters:
            sql += " AND from_list = ?"
            params.append(filters["from_list"])
        if "url" in filters:
            sql += " AND href = ?"
            # The value lives under the same "url" key that was just tested;
            # reading filters["href"] here raised KeyError.
            params.append(filters["url"])
        if "name" in filters:
            sql += " AND name LIKE ?"
            params.append(f"%{filters['name']}%")
        if "limit" in filters:
            sql += " limit ?"
            params.append(filters["limit"])

        cursor.execute(sql, params)
        return [{'href': row[0], 'id': row[1]} for row in cursor.fetchall()]
    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None
# 插入或更新类别 """ # 插入或更新类别 """
def insert_or_update_tags(name, href): def insert_or_update_tags(name, href):
@ -543,17 +649,20 @@ def insert_or_update_movie(movie):
# 获取相关 ID # 获取相关 ID
makers_id = get_id_by_href('javdb_makers', movie['maker_link']) if movie['maker_link'] else None makers_id = get_id_by_href('javdb_makers', movie['maker_link']) if movie['maker_link'] else None
series_id = get_id_by_href('javdb_series', movie['series_link']) if movie['series_link'] else None series_id = get_id_by_href('javdb_series', movie['series_link']) if movie['series_link'] else None
pub_id = get_id_by_href('javdb_publishers', movie['pub_link']) if movie['pub_link'] else None
# 如果不存在,插入 # 如果不存在,插入
if makers_id is None and movie['maker_link']: if makers_id is None and movie['maker_link']:
makers_id = insert_or_update_makers({'name' : movie.get('maker_name', ''), 'href' : movie.get('maker_link', '')}, caller='movie') makers_id = insert_or_update_makers({'name' : movie.get('maker_name', ''), 'href' : movie.get('maker_link', '')}, caller='movie')
if series_id is None and movie['series_link']: if series_id is None and movie['series_link']:
series_id = insert_or_update_series({'name' : movie.get('series_name', ''), 'href' : movie.get('series_link', '')}, caller='movie') series_id = insert_or_update_series({'name' : movie.get('series_name', ''), 'href' : movie.get('series_link', '')}, caller='movie')
if pub_id is None and movie['pub_link']:
pub_id = insert_or_update_publishers({'name' : movie.get('pub_name', ''), 'href' : movie.get('pub_link', '')}, caller='movie')
cursor.execute(""" cursor.execute("""
INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration, INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration,
maker_id, series_id, is_full_data, updated_at) maker_id, series_id, pub_id, is_full_data, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime')) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET ON CONFLICT(href) DO UPDATE SET
title=excluded.title, title=excluded.title,
cover_url=excluded.cover_url, cover_url=excluded.cover_url,
@ -562,10 +671,11 @@ def insert_or_update_movie(movie):
duration=excluded.duration, duration=excluded.duration,
maker_id=excluded.maker_id, maker_id=excluded.maker_id,
series_id=excluded.series_id, series_id=excluded.series_id,
pub_id=excluded.pub_id,
is_full_data=1, is_full_data=1,
updated_at=datetime('now', 'localtime') updated_at=datetime('now', 'localtime')
""", (movie['href'], movie['title'], movie['cover_url'], movie['serial_number'], """, (movie['href'], movie['title'], movie['cover_url'], movie['serial_number'],
movie['release_date'], movie['duration'], makers_id, series_id)) movie['release_date'], movie['duration'], makers_id, series_id, pub_id))
conn.commit() conn.commit()