modify scripts

This commit is contained in:
oscarz
2025-04-25 16:23:01 +08:00
parent 4c80e72a98
commit 40b5d3f99a
3 changed files with 198 additions and 11 deletions

View File

@ -161,6 +161,42 @@ def fetch_movies_by_series():
if debug:
return True
# Update movie information for every entry in the publishers list
def fetch_movies_by_publishers():
    """Crawl publisher listing pages and index every movie found on them.

    Publisher rows come from ``db_tools.query_publishers_hrefs``: a single row
    when ``debug`` is set, only rows with ``from_list=1`` when ``fast_mode`` is
    set, and all rows otherwise. Each listing is followed page by page using
    the next-page URL returned by ``scraper.parse_publisher_detail``, and each
    movie is handed to ``db_tools.insert_movie_index`` tagged with the
    publisher's id.

    Returns:
        True when ``debug`` is set and one publisher has been processed,
        otherwise None.
    """
    # ``debug`` overrides ``fast_mode``, so decide first and query only once.
    if debug:
        url_list = db_tools.query_publishers_hrefs(limit=1)
    elif fast_mode:
        url_list = db_tools.query_publishers_hrefs(from_list=1)
    else:
        url_list = db_tools.query_publishers_hrefs()

    # The validator does not depend on the page, so build it once.
    validator = partial(scraper.generic_validator, tag="div", identifier="modal-card", attr_type="class")

    # query_publishers_hrefs returns None on a database error; treat it as empty.
    for row in url_list or []:
        row_id = row['id']
        # Strip the query string (e.g. a "downloadable" flag) if there is one.
        next_url = utils.remove_url_query(row['href'])
        while next_url:
            # Keep the URL being fetched: next_url is overwritten by the parser.
            page_url = next_url
            logging.info(f"Fetching data for publisher url {page_url} ...")
            soup, status_code = scraper.fetch_page(page_url, validator)
            if not soup:
                # Any failed fetch (404 or otherwise) ends this publisher's
                # crawl; looping again would re-request the same URL forever.
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {page_url}')
                break

            list_data, next_url = scraper.parse_publisher_detail(soup, page_url)
            if not list_data:
                logging.warning(f'parse_page_movie error. url: {page_url}')
                continue

            for movie in list_data:
                # Bind to locals so the f-strings below need no nested quotes
                # (reusing the outer quote type only parses on Python 3.12+).
                title = movie['title']
                href = movie['href']
                tmp_id = db_tools.insert_movie_index(title=title, href=href, from_movie_publishers=1, pub_id=row_id)
                if tmp_id:
                    logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {title}, href: {href}')
                else:
                    logging.warning(f'insert movie index failed. title: {title}, href: {href}')

        # In debug mode stop after the first publisher.
        if debug:
            return True
# 更新演员信息
def fetch_performers_detail():
@ -316,6 +352,7 @@ function_map = {
"series_list": fetch_series_list,
"makers": fetch_movies_by_maker,
"series" : fetch_movies_by_series,
"pub" : fetch_movies_by_publishers,
"movies" : fetch_movies_detail,
"actors" : fetch_performers_detail,
}

View File

@ -355,6 +355,7 @@ def parse_movie_detail(soup, href, title):
# 获取maker系列
result['maker_name'], result['maker_link'] = parse_movie_val_href(soup, ['片商:', 'Maker:'])
result['series_name'], result['series_link'] = parse_movie_val_href(soup, ['系列:', 'Series:'])
result['pub_name'], result['pub_link'] = parse_movie_val_href(soup, ['發行:', 'Publisher:'])
# 获取演员tags
result['tags'] = parse_movie_arr(soup, ['類別:', 'Tags:'])
@ -523,10 +524,49 @@ def parse_maker_detail(soup, href):
return list_data, next_url
# 解析 HTML 内容,提取需要的数据
def parse_publisher_detail(soup, href):
    """Extract the movie entries and the next-page URL from a publisher page.

    Args:
        soup: Parsed page object exposing ``find`` / ``find_all``
            (presumably a BeautifulSoup document; confirm against caller).
        href: URL of the page being parsed. Its page number is compared with
            the "next" link so pagination only ever moves forward.

    Returns:
        A ``(list_data, next_url)`` tuple. ``list_data`` is a list of dicts
        with the keys ``href``, ``serial_number``, ``title`` and
        ``release_date``; it is empty when the movie container is missing.
        ``next_url`` is ``host_url`` plus the next page's link, or None when
        there is no further page.
    """
    def text_of(node):
        # A missing element yields '' instead of raising AttributeError.
        return node.text.strip() if node is not None else ''

    # The container's last class is either vcols-5 or vcols-8.
    div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if not div_movies:
        logging.warning("Warning: No movies div found ")
        return [], None

    list_data = []
    for row in div_movies.find_all('div', class_='item'):
        anchor = row.find('a', class_='box')
        if anchor is None:
            # Without a link the entry cannot be indexed; skip it rather than
            # letting one malformed item abort the whole page.
            logging.warning(f"skip movie item without link. url: {href}")
            continue
        link = anchor.get('href')
        list_data.append({
            'href': host_url + link if link else '',
            'serial_number': text_of(row.find('strong')),
            'title': text_of(row.find('div', class_='video-title')),
            'release_date': text_of(row.find('div', class_='meta')),
        })

    # Look for the "next page" button.
    next_url = None
    next_page_element = soup.find('a', class_='pagination-next')
    next_page_url = next_page_element.get('href') if next_page_element is not None else None
    if next_page_url:
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            # No page number in the current URL: treat it as page 0.
            current_page_number = 0
        # Only follow the link when it really points to a later page.
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
# 解析 HTML 内容,提取需要的数据
def parse_uncensored(soup, href):
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8')
#div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8')
div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
if not div_movies:
logging.warning(f"Warning: No movies div found ")
return [], None

View File

@ -64,14 +64,14 @@ def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None):
logging.error(f"未知错误: {e}")
return None
def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None, maker_id=None, series_id=None):
def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None, maker_id=None, series_id=None, from_movie_publishers=None, pub_id=None):
try:
# **先检查数据库中是否已有该电影**
cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id FROM javdb_movies WHERE href = ?", (href,))
cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id FROM javdb_movies WHERE href = ?", (href,))
existing_movie = cursor.fetchone()
if existing_movie: # **如果电影已存在**
movie_id, existing_actor, existing_maker, existing_series, existing_maker_id, existing_series_id = existing_movie
movie_id, existing_actor, existing_maker, existing_series, existing_maker_id, existing_series_id, existing_pub, existing_pub_id = existing_movie
# **如果没有传入值,就用原来的值**
from_actor_list = from_actor_list if from_actor_list is not None else existing_actor
@ -79,6 +79,8 @@ def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None
from_movie_series = from_movie_series if from_movie_series is not None else existing_series
maker_id = maker_id if maker_id is not None else existing_maker_id
series_id = series_id if series_id is not None else existing_series_id
from_movie_publishers = from_movie_publishers if from_movie_publishers is not None else existing_pub
pub_id = pub_id if pub_id is not None else existing_pub_id
cursor.execute("""
UPDATE javdb_movies
@ -88,14 +90,16 @@ def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None
from_movie_series = ?,
maker_id = ?,
series_id = ?,
from_movie_publishers = ?,
pub_id = ?,
updated_at = datetime('now', 'localtime')
WHERE href = ?
""", (title, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, href))
""", (title, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id, href))
else: # **如果电影不存在,插入**
cursor.execute("""
INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id)
VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
""", (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id))
INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id)
VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
""", (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id))
conn.commit()
@ -165,6 +169,7 @@ def insert_or_update_actor(actor):
movie_id = get_id_by_href('javdb_movies', movie['href'])
# 影片不存在,先插入
if movie_id is None:
# TODO: from_actor_list 只标记无码女优的话,这里要修改,暂时不动
movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1)
if movie_id:
tmp_id = insert_actor_movie(actor_id, movie_id)
@ -470,6 +475,107 @@ def query_series_hrefs(**filters):
logging.error(f"查询 href 失败: {e}")
return None
# Insert or update a publisher
def insert_or_update_publishers(data, caller='list'):
    """Upsert a publisher row keyed by ``href`` and return its id.

    Args:
        data: Mapping with the keys ``name`` and ``href``.
        caller: ``'list'`` flags the row with ``from_list = 1``; ``'movie'``
            flags it with ``from_movie_list = 1``. Any other value is logged
            and rejected.

    Returns:
        The publisher id, or None when ``caller`` is not recognised, the row
        cannot be read back, or a ``sqlite3.Error`` occurs (the transaction
        is rolled back in that case).
    """
    # One upsert statement per caller; they differ only in the flag column.
    upsert_sql = {
        'list': """
            INSERT INTO javdb_publishers (name, href, from_list, updated_at)
            VALUES (?, ? , 1, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                name = excluded.name,
                from_list = 1,
                updated_at = datetime('now', 'localtime')
        """,
        'movie': """
            INSERT INTO javdb_publishers (name, href, from_movie_list, updated_at)
            VALUES (?, ? , 1, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                name = excluded.name,
                from_movie_list = 1,
                updated_at = datetime('now', 'localtime')
        """,
    }
    try:
        sql = upsert_sql.get(caller) if isinstance(caller, str) else None
        if sql is None:
            logging.warning(f"unexpected caller: {caller}")
            return None

        cursor.execute(sql, (data["name"], data["href"]))
        conn.commit()

        # Read back the publisher id for the href just written.
        cursor.execute("SELECT id FROM javdb_publishers WHERE href = ?", (data["href"],))
        row = cursor.fetchone()
        # fetchone() returns None when nothing matched; guard before indexing.
        if row is None or not row[0]:
            return None

        logging.debug(f"成功插入/更新发行商: {data['name']}")
        return row[0]
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"数据库错误: {e}")
        return None
# Delete a publisher (by id or name)
def delete_publishers(identifier):
    """Delete publisher rows by id (``int``) or by exact name (``str``).

    Args:
        identifier: Publisher id or exact publisher name. Any other type is
            rejected with a warning and nothing is deleted or committed.

    Returns:
        None. A ``sqlite3.Error`` is logged and the transaction rolled back.
    """
    if isinstance(identifier, int):
        sql = "DELETE FROM javdb_publishers WHERE id = ?"
    elif isinstance(identifier, str):
        sql = "DELETE FROM javdb_publishers WHERE name = ?"
    else:
        # Nothing can be matched, so do not commit or report a success.
        logging.warning(f"unsupported identifier type: {type(identifier).__name__}")
        return

    try:
        cursor.execute(sql, (identifier,))
        conn.commit()
        logging.info(f"成功删除发行商: {identifier}")
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"删除失败: {e}")
# Query a publisher (by id or name)
def query_publishers(identifier):
    """Fetch one publisher row by id (``int``) or by name substring.

    Args:
        identifier: An ``int`` is matched against ``id``; any other value is
            wrapped in ``%`` wildcards and matched against ``name`` with LIKE.

    Returns:
        A dict mapping column names to values for the first matching row, or
        None when nothing matches or a ``sqlite3.Error`` occurs.
    """
    try:
        if isinstance(identifier, int):
            statement = "SELECT * FROM javdb_publishers WHERE id = ?"
            argument = identifier
        else:
            statement = "SELECT * FROM javdb_publishers WHERE name LIKE ?"
            argument = f"%{identifier}%"
        cursor.execute(statement, (argument,))

        record = cursor.fetchone()
        if not record:
            logging.warning(f"未找到发行商: {identifier}")
            return None

        # Pair each column name from the cursor description with its value.
        column_names = [column[0] for column in cursor.description]
        return dict(zip(column_names, record))
    except sqlite3.Error as e:
        logging.error(f"查询失败: {e}")
        return None
# 按条件查询 href 列表
def query_publishers_hrefs(**filters):
    """Return publisher hrefs and ids matching the given filters.

    Keyword Args:
        id: Exact publisher id.
        from_list: Exact value of the ``from_list`` column.
        url: Exact ``href`` value.
        name: Substring matched against ``name`` with LIKE.
        limit: Maximum number of rows to return.

    Returns:
        A list of ``{'href': ..., 'id': ...}`` dicts, or None when a
        ``sqlite3.Error`` occurs.
    """
    sql = "SELECT href, id FROM javdb_publishers WHERE 1=1"
    params = []

    # Exact-match filters: (keyword, SQL fragment). The value is always read
    # from the same keyword that triggered the clause.
    exact_filters = (
        ("id", " AND id = ?"),
        ("from_list", " AND from_list = ?"),
        ("url", " AND href = ?"),
    )
    for key, clause in exact_filters:
        if key in filters:
            sql += clause
            params.append(filters[key])

    if "name" in filters:
        sql += " AND name LIKE ?"
        params.append(f"%{filters['name']}%")

    # LIMIT must stay last so it follows every WHERE condition.
    if "limit" in filters:
        sql += " limit ?"
        params.append(filters["limit"])

    try:
        cursor.execute(sql, params)
        return [{'href': row[0], 'id': row[1]} for row in cursor.fetchall()]
    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None
# Insert or update a tag (category)
def insert_or_update_tags(name, href):
@ -543,17 +649,20 @@ def insert_or_update_movie(movie):
# 获取相关 ID
makers_id = get_id_by_href('javdb_makers', movie['maker_link']) if movie['maker_link'] else None
series_id = get_id_by_href('javdb_series', movie['series_link']) if movie['series_link'] else None
pub_id = get_id_by_href('javdb_publishers', movie['pub_link']) if movie['pub_link'] else None
# 如果不存在,插入
if makers_id is None and movie['maker_link']:
makers_id = insert_or_update_makers({'name' : movie.get('maker_name', ''), 'href' : movie.get('maker_link', '')}, caller='movie')
if series_id is None and movie['series_link']:
series_id = insert_or_update_series({'name' : movie.get('series_name', ''), 'href' : movie.get('series_link', '')}, caller='movie')
if pub_id is None and movie['pub_link']:
pub_id = insert_or_update_publishers({'name' : movie.get('pub_name', ''), 'href' : movie.get('pub_link', '')}, caller='movie')
cursor.execute("""
INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration,
maker_id, series_id, is_full_data, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
maker_id, series_id, pub_id, is_full_data, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
title=excluded.title,
cover_url=excluded.cover_url,
@ -562,10 +671,11 @@ def insert_or_update_movie(movie):
duration=excluded.duration,
maker_id=excluded.maker_id,
series_id=excluded.series_id,
pub_id=excluded.pub_id,
is_full_data=1,
updated_at=datetime('now', 'localtime')
""", (movie['href'], movie['title'], movie['cover_url'], movie['serial_number'],
movie['release_date'], movie['duration'], makers_id, series_id))
movie['release_date'], movie['duration'], makers_id, series_id, pub_id))
conn.commit()