modify scripts

This commit is contained in:
oscarz
2025-07-05 16:47:46 +08:00
parent 95b4d8b414
commit 02334a8005
8 changed files with 144 additions and 93 deletions

View File

@@ -0,0 +1,17 @@
# Shared constants: spider names and item types
SPIDER_NAME_SIS = 'sis'
SPIDER_NAME_U3C3 = 'u3c3'
SPIDER_NAME_IAFD = 'iafd'
ITEM_TYPE_LIST = 'list'
ITEM_TYPE_MOVIE_INDEX = 'movie_index'
ITEM_TYPE_ACTOR_INDEX = 'actor_index'
ITEM_TYPE_MOVIE_DETAIL = 'movie_detail'
ITEM_TYPE_ACTOR_DETAIL = 'actor_detail'

View File

@@ -2,16 +2,85 @@ import os
import sqlite3
import logging
from datetime import datetime
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, shared_db_path
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
import scrapy_proj.comm.comm_def as comm
# Registry dict: spider name -> DB handler class
spider_handler_registry = {}
def register_handler(spider_name):
def decorator(cls):
spider_handler_registry[spider_name.lower()] = cls
return cls
return decorator
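This decorator registry is the heart of the refactor: each handler class adds itself to spider_handler_registry under its spider name, and the pipeline later resolves the class from spider.name. A minimal self-contained sketch of the same pattern (names below are illustrative, not the project's):

# Standalone sketch of the register/lookup pattern above.
registry = {}

def register(name):
    def decorator(cls):
        registry[name.lower()] = cls   # keyed by lowercased spider name
        return cls
    return decorator

@register('demo')
class DemoHandler:
    def insert_item(self, item):
        print(f"insert: {item}")

handler_cls = registry.get('demo')     # what open_spider does with spider.name
if handler_cls:
    handler_cls().insert_item({'url': 'https://example.com'})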
@register_handler(comm.SPIDER_NAME_SIS)
class SisDBHandler(SQLiteDBHandler):
def __init__(self, db_path=default_dbpath):
super().__init__(db_path)
self.tbl_name_sis = 'sis'
def insert_item(self, item):
self.insert_or_update_common(item, tbl_name=self.tbl_name_sis, uniq_key='url', exists_do_nothing=True)
def _create_tables(self):
# Create the sis001 data table
self.cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
id INTEGER PRIMARY KEY AUTOINCREMENT,
plate_name TEXT,
title TEXT,
url TEXT UNIQUE,
size_text TEXT,
size_gb REAL,
update_date TEXT,
created_at TEXT DEFAULT (datetime('now', 'localtime')),
updated_at TEXT DEFAULT (datetime('now', 'localtime'))
)
''')
self.conn.commit()
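insert_or_update_common itself is not part of this diff; assuming it wraps SQLite's conflict handling, the uniq_key='url', exists_do_nothing=True call likely reduces to one of the statements below (the version check in SQLiteDBHandler later in this diff suggests a fallback for SQLite older than 3.24, which lacks UPSERT):

# Hypothetical SQL behind insert_or_update_common(..., exists_do_nothing=True);
# the actual implementation is not shown in this diff.
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute("CREATE TABLE sis (url TEXT UNIQUE, title TEXT)")
# SQLite >= 3.24: UPSERT with an explicit conflict target on the unique key
conn.execute(
    "INSERT INTO sis (url, title) VALUES (?, ?) ON CONFLICT(url) DO NOTHING",
    ('https://example.com/t/1', 'demo'),
)
# Older SQLite (the lower_sqlite_version branch): INSERT OR IGNORE
conn.execute(
    "INSERT OR IGNORE INTO sis (url, title) VALUES (?, ?)",
    ('https://example.com/t/1', 'demo'),
)
conn.commit()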
class IAFDQuery(SQLiteDBHandler):
@register_handler(comm.SPIDER_NAME_U3C3)
class U3C3DBHandler(SQLiteDBHandler):
def __init__(self, db_path=default_dbpath):
super().__init__(db_path)
self.tbl_name_u3c3 = 'u3c3'
def insert_item(self, item):
self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True)
def _create_tables(self):
# Create the u3c3 (u001 site) data table
self.cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
id INTEGER PRIMARY KEY AUTOINCREMENT,
category TEXT,
title TEXT,
url TEXT UNIQUE,
torrent_url TEXT,
magnet_url TEXT,
size_text TEXT,
size_gb REAL,
update_date TEXT,
created_at TEXT DEFAULT (datetime('now', 'localtime')),
updated_at TEXT DEFAULT (datetime('now', 'localtime'))
)
''')
self.conn.commit()
@register_handler(comm.SPIDER_NAME_IAFD)
class IAFDDBHandler(SQLiteDBHandler):
def __init__(self, db_path=shared_db_path):
super().__init__(db_path)
self.tbl_name_performers = 'iafd_performers'
self.tbl_name_movies = 'iafd_movies'
self.uniq_key = 'href'
def insert_item(self, item):
pass
# Query a list of hrefs by filter conditions
def get_performers(self, **filters):
try:
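The hunk is truncated right after try:, so the body of get_performers is not visible here. A plausible sketch of a keyword-filter query, assuming the filter keys are column names of iafd_performers (an assumption, not the project's code):

# Hypothetical completion; the real body is cut off in this diff.
def get_performers(self, **filters):
    sql = f"SELECT href FROM {self.tbl_name_performers}"
    params = []
    if filters:
        clauses = [f"{col} = ?" for col in filters]   # assumes keys are column names
        params = list(filters.values())
        sql += " WHERE " + " AND ".join(clauses)
    self.cursor.execute(sql, params)
    return [row[0] for row in self.cursor.fetchall()]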

View File

@@ -31,6 +31,13 @@ class SQLiteDBHandler:
if sqlite_version < (3, 24, 0):
self.lower_sqlite_version = True
def _create_tables(self):
pass
# Interface method; must be implemented by each subclass
def insert_item(self, item):
raise NotImplementedError("Subclasses must implement insert_item")
def get_table_columns_and_defaults(self, tbl_name):
try:
self.cursor.execute(f"PRAGMA table_info({tbl_name})")
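This hunk is also cut off mid-method. For reference, PRAGMA table_info yields one row per column as (cid, name, type, notnull, dflt_value, pk), so a minimal version of what the method presumably returns could look like this (a sketch, not the project's code):

# Hypothetical sketch built on PRAGMA table_info's documented row layout:
# (cid, name, type, notnull, dflt_value, pk).
def get_table_columns_and_defaults(cursor, tbl_name):
    cursor.execute(f"PRAGMA table_info({tbl_name})")
    rows = cursor.fetchall()
    columns = [row[1] for row in rows]             # column names
    defaults = {row[1]: row[4] for row in rows}    # name -> default expr, or None
    return columns, defaults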

View File

@@ -5,9 +5,11 @@
# items.py
import scrapy
import scrapy_proj.comm.comm_def as comm
# u3c3.in
class U001Item(scrapy.Item):
item_type = comm.ITEM_TYPE_LIST
category = scrapy.Field()
title = scrapy.Field()
url = scrapy.Field()
@@ -19,6 +21,7 @@ class U001Item(scrapy.Item):
# sis001.com
class Sis001Item(scrapy.Item):
item_type = comm.ITEM_TYPE_LIST
title = scrapy.Field()
url = scrapy.Field()
plate_name = scrapy.Field()
@@ -27,6 +30,7 @@ class Sis001Item(scrapy.Item):
update_date = scrapy.Field()
class IAFDPersonItem(scrapy.Item):
item_type = comm.ITEM_TYPE_ACTOR_INDEX
name = scrapy.Field()
href = scrapy.Field()
from_astro_list = scrapy.Field()
@@ -35,6 +39,7 @@ class IAFDPersonItem(scrapy.Item):
from_movie_list = scrapy.Field()
class IAFDMovieItem(scrapy.Item):
item_type = comm.ITEM_TYPE_MOVIE_INDEX
title = scrapy.Field()
href = scrapy.Field()
release_year = scrapy.Field()
@@ -43,6 +48,7 @@ class IAFDMovieItem(scrapy.Item):
from_stu_list = scrapy.Field()
class IAFDPersonDetailItem(scrapy.Item):
item_type = comm.ITEM_TYPE_ACTOR_DETAIL
href = scrapy.Field()
person = scrapy.Field()
gender = scrapy.Field()
@@ -67,6 +73,7 @@ class IAFDPersonDetailItem(scrapy.Item):
performer_aka = scrapy.Field()
class IAFDMovieDetailItem(scrapy.Item):
item_type = comm.ITEM_TYPE_MOVIE_DETAIL
title = scrapy.Field()
href = scrapy.Field()
# More movie-detail fields can be added as needed
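The new item_type class attribute tags every item with one of the comm_def constants, which lets a handler route items without the isinstance chain the old pipeline used. A minimal sketch of tag-based routing (hypothetical; this diff leaves IAFDDBHandler.insert_item as pass, so the real routing is not shown):

import scrapy_proj.comm.comm_def as comm
from scrapy_proj.items import IAFDPersonItem

# Hypothetical dispatch keyed on the item_type tag.
def route_item(item):
    handlers = {
        comm.ITEM_TYPE_ACTOR_INDEX: lambda i: print('performer index:', dict(i)),
        comm.ITEM_TYPE_MOVIE_INDEX: lambda i: print('movie index:', dict(i)),
    }
    handler = handlers.get(type(item).item_type)   # class attribute on each Item subclass
    if handler:
        handler(item)

route_item(IAFDPersonItem(name='demo', href='/person/demo.htm'))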

View File

@@ -9,91 +9,39 @@
#class ScrapyProjPipeline:
# def process_item(self, item, spider):
# return item
import os
import sqlite3
import json
import scrapy
import logging
from datetime import datetime
from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler
from scrapy_proj.db_wapper.spider_db_handler import spider_handler_registry, U3C3DBHandler, SisDBHandler, IAFDDBHandler
class SQLitePipeline(SQLiteDBHandler):
def __init__(self, db_path=None):
super().__init__(db_path)
self.tbl_name_u3c3 = 'u3c3'
self.tbl_name_sis = 'sis'
self._create_tables()
def _create_tables(self):
# Create the u001 data table
self.cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
id INTEGER PRIMARY KEY AUTOINCREMENT,
category TEXT,
title TEXT,
url TEXT UNIQUE,
torrent_url TEXT,
magnet_url TEXT,
size_text TEXT,
size_gb REAL,
update_date TEXT,
created_at TEXT DEFAULT (datetime('now', 'localtime')),
updated_at TEXT DEFAULT (datetime('now', 'localtime'))
)
''')
class SQLitePipeline:
def __init__(self):
self.db_handlers = {}
def open_spider(self, spider):
spider_name = spider.name.lower()
handler_class = spider_handler_registry.get(spider_name)
if not handler_class:
raise ValueError(f"No database handler class registered for spider {spider_name}")
self.db_handlers[spider_name] = handler_class()
# Create the sis001 data table
self.cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
id INTEGER PRIMARY KEY AUTOINCREMENT,
plate_name TEXT,
title TEXT,
url TEXT UNIQUE,
size_text TEXT,
size_gb REAL,
update_date TEXT,
created_at TEXT DEFAULT (datetime('now', 'localtime')),
updated_at TEXT DEFAULT (datetime('now', 'localtime'))
)
''')
self.conn.commit()
def process_item(self, item, spider):
if isinstance(item, U001Item):
self._process_u001_item(item, spider)
elif isinstance(item, Sis001Item):
self._process_sis001_item(item, spider)
elif isinstance(item, IAFDPersonItem):
self._process_iafd_person_item(item, spider)
elif isinstance(item, IAFDPersonDetailItem):
self._process_iafd_person_detail_item(item, spider)
elif isinstance(item, IAFDMovieItem):
self._process_iafd_movie_item(item, spider)
elif isinstance(item, IAFDMovieDetailItem):
self._process_iafd_movie_detail_item(item, spider)
return item
def _process_u001_item(self, item, spider):
spider.logger.debug(f"insert one item. href:{spider.name}")
return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True)
def _process_sis001_item(self, item, spider):
spider.logger.debug(f"insert one item. href:{spider.name}")
return self.insert_or_update_common(item, tbl_name=self.tbl_name_sis, uniq_key='url', exists_do_nothing=True)
def _process_iafd_person_item(self, item, spider):
spider.logger.debug(f"deal with person item. {item}")
def _process_iafd_movie_item(self, item, spider):
spider.logger.debug(f"deal with movie item. {item}")
def _process_iafd_person_detail_item(self, item, spider):
spider.logger.debug(f"deal with person item. {item}")
def _process_iafd_movie_detail_item(self, item, spider):
spider.logger.debug(f"deal with movie item. {item}")
def close_spider(self, spider):
self.conn.close()
spider_name = spider.name.lower()
handler = self.db_handlers.pop(spider_name, None)
if handler:
handler.close()
def process_item(self, item, spider):
spider_name = spider.name.lower()
handler = self.db_handlers.get(spider_name)
if not handler:
raise ValueError(f"No database handler found for spider {spider_name}")
# Serialize the item to single-line JSON
item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")
handler.insert_item(item)
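For the rewritten pipeline to run at all, it still has to be enabled in the project's settings.py in the usual Scrapy way (the priority value here is arbitrary):

# settings.py -- standard Scrapy pipeline registration
ITEM_PIPELINES = {
    'scrapy_proj.pipelines.SQLitePipeline': 300,   # 300 is an illustrative priority
}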

View File

@@ -2,12 +2,13 @@ import scrapy
import re
from scrapy_proj.spiders.base_spider import BaseSpider
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
from scrapy_proj.db_wapper.iafd_query import IAFDQuery
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
db_tools = IAFDQuery()
db_tools = IAFDDBHandler()
class IAFDSpider(BaseSpider):
name = "iafd"
name = SPIDER_NAME_IAFD
allowed_domains = ["iafd.com"]
host_url = "https://www.iafd.com"

View File

@@ -4,9 +4,10 @@ from urllib.parse import urljoin
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import Sis001Item
from scrapy_proj.utils.utils import parse_size_format, parse_date_to_datetime
from scrapy_proj.comm.comm_def import SPIDER_NAME_SIS
class Sis001Spider(BaseSpider):
name = "sis"
name = SPIDER_NAME_SIS
allowed_domains = ["sis001.com"]
def __init__(self, debug='False', begin=None, *args, **kwargs):
@@ -111,9 +112,9 @@ class Sis001Spider(BaseSpider):
# Decide whether to keep paging: stop only when every item on the page is dated before the begin date
up_date = parse_date_to_datetime(item['update_date'])
self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
#self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()) :
self.logger.debug(f"find early data.")
pass
else:
need_next = True
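The guard above keeps paging unless an item's date falls outside the (begin, now) window; extracted as a standalone predicate, the same logic reads (a sketch, not project code):

from datetime import datetime

# Same window check as the spider code above, as a pure function.
def in_crawl_window(up_date, begin, now=None):
    now = now or datetime.now()
    if up_date and begin and (up_date < begin or up_date > now):
        return False   # dated before `begin`, or a bogus future date: stop paging
    return True        # still inside the window: request the next page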

View File

@@ -3,9 +3,10 @@ import scrapy
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import U001Item
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
from scrapy_proj.comm.comm_def import SPIDER_NAME_U3C3
class U001Spider(BaseSpider):
name = "u3c3"
name = SPIDER_NAME_U3C3
allowed_domains = ["u001.25img.com"]
start_urls = ["https://u001.25img.com/?p=1"]
@@ -37,9 +38,9 @@ class U001Spider(BaseSpider):
# Decide whether to keep paging: stop only when every item on the page is dated before the begin date
up_date = parse_date_to_datetime(item['update_date'])
self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
#self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()):
self.logger.debug(f"find early data.")
pass
else:
need_next = True
yield item