# Refresh the movie index from the listing pages of every known publisher.
def fetch_movies_by_publishers():
    """Crawl each publisher's listing pages and index the movies found on them.

    Publisher rows come from ``db_tools.query_publishers_hrefs``:
      * ``debug``     -> a single row (``limit=1``),
      * ``fast_mode`` -> only rows flagged ``from_list=1``,
      * otherwise     -> every row.

    For each publisher the listing is followed page by page through the
    ``next_url`` returned by ``scraper.parse_publisher_detail``; every movie
    on a page is written with ``db_tools.insert_movie_index`` and tagged with
    ``from_movie_publishers=1`` and the publisher's row id.

    Returns:
        True when ``debug`` is set (the crawl stops after the first
        publisher); otherwise None.
    """
    if debug:
        filters = {'limit': 1}
    elif fast_mode:
        filters = {'from_list': 1}
    else:
        filters = {}

    # The query helper may hand back None when the lookup fails; treat that
    # as "nothing to crawl" instead of crashing on iteration.
    url_list = db_tools.query_publishers_hrefs(**filters) or []

    # The validator does not depend on the URL, so build it once.
    validator = partial(scraper.generic_validator, tag="div", identifier="modal-card", attr_type="class")

    # Upper bound on consecutive failed fetches of one page. Without it a
    # persistent non-404 failure would re-request the same URL forever.
    max_failures = 3

    for row in url_list:
        row_id = row['id']
        # Strip the query string (e.g. a "downloadable" filter flag), if any.
        next_url = utils.remove_url_query(row['href'])
        failures = 0

        while next_url:
            # Remember the page being processed: next_url is overwritten by
            # the parser, and log messages must refer to the current page.
            page_url = next_url
            logging.info(f"Fetching data for publisher url {page_url} ...")
            soup, status_code = scraper.fetch_page(page_url, validator)

            if not soup:
                if status_code == 404:
                    logging.warning(f"fetch page error. httpcode: {status_code}, url: {page_url}")
                    break
                failures += 1
                if failures >= max_failures:
                    logging.warning(f"fetch page failed {failures} times, giving up. url: {page_url}")
                    break
                continue  # retry the same page

            failures = 0
            list_data, next_url = scraper.parse_publisher_detail(soup, page_url)
            if not list_data:
                logging.warning(f"parse_page_movie error. url: {page_url}")
                continue

            for movie in list_data:
                tmp_id = db_tools.insert_movie_index(
                    title=movie['title'],
                    href=movie['href'],
                    from_movie_publishers=1,
                    pub_id=row_id,
                )
                # Outer double quotes keep these f-strings valid on
                # interpreters older than Python 3.12.
                if tmp_id:
                    logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")

        # Debug runs stop after the first publisher.
        if debug:
            return True
= url_page_num(next_page_url) + current_page_number = url_page_num(href) + if current_page_number is None: + current_page_number = 0 + if next_page_number and next_page_number > current_page_number : + next_url = host_url + next_page_url + + return list_data, next_url + # 解析 HTML 内容,提取需要的数据 def parse_uncensored(soup, href): - div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8') + #div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8') + div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)')) if not div_movies: logging.warning(f"Warning: No movies div found ") return [], None diff --git a/javdb/src/sqlite_utils.py b/javdb/src/sqlite_utils.py index 06bed7d..36f414a 100644 --- a/javdb/src/sqlite_utils.py +++ b/javdb/src/sqlite_utils.py @@ -64,14 +64,14 @@ def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None): logging.error(f"未知错误: {e}") return None -def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None, maker_id=None, series_id=None): +def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None, maker_id=None, series_id=None, from_movie_publishers=None, pub_id=None): try: # **先检查数据库中是否已有该电影** - cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id FROM javdb_movies WHERE href = ?", (href,)) + cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id FROM javdb_movies WHERE href = ?", (href,)) existing_movie = cursor.fetchone() if existing_movie: # **如果电影已存在** - movie_id, existing_actor, existing_maker, existing_series, existing_maker_id, existing_series_id = existing_movie + movie_id, existing_actor, existing_maker, existing_series, existing_maker_id, existing_series_id, existing_pub, existing_pub_id = existing_movie # **如果没有传入值,就用原来的值** from_actor_list = from_actor_list 
if from_actor_list is not None else existing_actor @@ -79,6 +79,8 @@ def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None from_movie_series = from_movie_series if from_movie_series is not None else existing_series maker_id = maker_id if maker_id is not None else existing_maker_id series_id = series_id if series_id is not None else existing_series_id + from_movie_publishers = from_movie_publishers if from_movie_publishers is not None else existing_pub + pub_id = pub_id if pub_id is not None else existing_pub_id cursor.execute(""" UPDATE javdb_movies @@ -88,14 +90,16 @@ def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None from_movie_series = ?, maker_id = ?, series_id = ?, + from_movie_publishers = ?, + pub_id = ?, updated_at = datetime('now', 'localtime') WHERE href = ? - """, (title, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, href)) + """, (title, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id, href)) else: # **如果电影不存在,插入** cursor.execute(""" - INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id) - VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0)) - """, (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id)) + INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id) + VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0)) + """, (title, href, from_actor_list, from_movie_makers, from_movie_series, maker_id, series_id, from_movie_publishers, pub_id)) conn.commit() @@ -165,6 +169,7 @@ def insert_or_update_actor(actor): movie_id = get_id_by_href('javdb_movies', movie['href']) # 影片不存在,先插入 if movie_id is None: + # TODO: 
# Insert a publisher, or update it when its href already exists.
def insert_or_update_publishers(data, caller='list'):
    """Upsert a row in ``javdb_publishers`` keyed by ``href``.

    Args:
        data: mapping with the keys ``"name"`` and ``"href"``.
        caller: ``'list'`` sets the ``from_list`` flag, ``'movie'`` sets the
            ``from_movie_list`` flag. Any other value is rejected.

    Returns:
        The publisher's row id, or None when ``caller`` is not recognised,
        the row cannot be read back, or SQLite reports an error (the
        transaction is rolled back in that case).
    """
    # Whitelist of flag columns; only these fixed names are ever
    # interpolated into the SQL text, values always go through placeholders.
    flag_column = {'list': 'from_list', 'movie': 'from_movie_list'}.get(caller)
    if flag_column is None:
        logging.warning(f"unexpected caller: {caller}")
        return None

    try:
        cursor.execute(f"""
            INSERT INTO javdb_publishers (name, href, {flag_column}, updated_at)
            VALUES (?, ?, 1, datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
                name = excluded.name,
                {flag_column} = 1,
                updated_at = datetime('now', 'localtime')
        """, (data["name"], data["href"]))
        conn.commit()

        # Read the publisher id back by href.
        cursor.execute("SELECT id FROM javdb_publishers WHERE href = ?", (data["href"],))
        row = cursor.fetchone()
        if row and row[0]:
            logging.debug(f"成功插入/更新发行商: {data['name']}")
            return row[0]
        return None
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"数据库错误: {e}")
        return None


# Delete a publisher by id or by exact name.
def delete_publishers(identifier):
    """Delete rows from ``javdb_publishers``.

    Args:
        identifier: an ``int`` is matched against ``id``; a ``str`` is
            matched against ``name`` (exact match). Any other type is
            ignored with a warning, and nothing is deleted or committed.
    """
    if isinstance(identifier, int):
        sql = "DELETE FROM javdb_publishers WHERE id = ?"
    elif isinstance(identifier, str):
        sql = "DELETE FROM javdb_publishers WHERE name = ?"
    else:
        # Previously this fell through and reported a successful delete.
        logging.warning(f"unsupported identifier type: {type(identifier).__name__}")
        return

    try:
        cursor.execute(sql, (identifier,))
        conn.commit()
        logging.info(f"成功删除发行商: {identifier}")
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"删除失败: {e}")


# Look up one publisher by id or by partial name.
def query_publishers(identifier):
    """Return the first matching ``javdb_publishers`` row as a dict.

    Args:
        identifier: an ``int`` is matched against ``id``; anything else is
            used in a ``name LIKE '%identifier%'`` search.

    Returns:
        A ``{column_name: value}`` dict for the first matching row, or None
        when nothing matches or SQLite reports an error.
    """
    try:
        if isinstance(identifier, int):
            cursor.execute("SELECT * FROM javdb_publishers WHERE id = ?", (identifier,))
        else:
            cursor.execute("SELECT * FROM javdb_publishers WHERE name LIKE ?", (f"%{identifier}%",))

        row = cursor.fetchone()
        if not row:
            logging.warning(f"未找到发行商: {identifier}")
            return None
        columns = [desc[0] for desc in cursor.description]
        return dict(zip(columns, row))
    except sqlite3.Error as e:
        logging.error(f"查询失败: {e}")
        return None


# Query publisher hrefs (with their ids) by optional filters.
def query_publishers_hrefs(**filters):
    """List publishers as ``{'href': ..., 'id': ...}`` dicts.

    Supported keyword filters (all optional, combined with AND):
        id: exact row id.
        from_list: exact value of the ``from_list`` flag.
        url / href: exact ``href`` match. ``href`` wins when both are given.
        name: substring match (``LIKE '%name%'``).
        limit: maximum number of rows.

    Returns:
        A list of dicts, or None when SQLite reports an error.
    """
    try:
        sql = "SELECT href, id FROM javdb_publishers WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "from_list" in filters:
            sql += " AND from_list = ?"
            params.append(filters["from_list"])
        if "url" in filters or "href" in filters:
            # The filter used to be detected under "url" but read from
            # "href", raising KeyError; accept either spelling.
            sql += " AND href = ?"
            params.append(filters["href"] if "href" in filters else filters["url"])
        if "name" in filters:
            sql += " AND name LIKE ?"
            params.append(f"%{filters['name']}%")
        if "limit" in filters:
            sql += " limit ?"
            params.append(filters["limit"])

        cursor.execute(sql, params)
        return [{'href': row[0], 'id': row[1]} for row in cursor.fetchall()]

    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None
movie.get('series_link', '')}, caller='movie') + if pub_id is None and movie['pub_link']: + pub_id = insert_or_update_publishers({'name' : movie.get('pub_name', ''), 'href' : movie.get('pub_link', '')}, caller='movie') cursor.execute(""" INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration, - maker_id, series_id, is_full_data, updated_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime')) + maker_id, series_id, pub_id, is_full_data, updated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime')) ON CONFLICT(href) DO UPDATE SET title=excluded.title, cover_url=excluded.cover_url, @@ -562,10 +671,11 @@ def insert_or_update_movie(movie): duration=excluded.duration, maker_id=excluded.maker_id, series_id=excluded.series_id, + pub_id=excluded.pub_id, is_full_data=1, updated_at=datetime('now', 'localtime') """, (movie['href'], movie['title'], movie['cover_url'], movie['serial_number'], - movie['release_date'], movie['duration'], makers_id, series_id)) + movie['release_date'], movie['duration'], makers_id, series_id, pub_id)) conn.commit()