modify scripts
This commit is contained in:
17
scrapy_proj/scrapy_proj/comm/comm_def.py
Normal file
17
scrapy_proj/scrapy_proj/comm/comm_def.py
Normal file
@ -0,0 +1,17 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
#
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
# Spider identifiers: used as each spider's ``name`` and as the key under
# which its DB handler class is registered.
SPIDER_NAME_SIS = 'sis'
SPIDER_NAME_U3C3 = 'u3c3'
SPIDER_NAME_IAFD = 'iafd'

# Item type tags assigned to the item classes in items.py.
ITEM_TYPE_LIST = 'list'
ITEM_TYPE_MOVIE_INDEX = 'movie_index'
ITEM_TYPE_ACTOR_INDEX = 'actor_index'
ITEM_TYPE_MOVIE_DETAIL = 'movie_detail'
ITEM_TYPE_ACTOR_DETAIL = 'actor_detail'
|
||||
@ -2,16 +2,85 @@ import os
|
||||
import sqlite3
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, shared_db_path
|
||||
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
|
||||
import scrapy_proj.comm.comm_def as comm
|
||||
|
||||
# Registry dictionary: lower-cased spider name -> DB handler class.
spider_handler_registry = {}


def register_handler(spider_name):
    """Return a class decorator that registers a handler for ``spider_name``.

    The decorated class is stored in ``spider_handler_registry`` under
    ``spider_name.lower()`` and is handed back unchanged, so the decorator
    has no effect on the class itself.
    """
    def _bind(handler_cls):
        registry_key = spider_name.lower()
        spider_handler_registry[registry_key] = handler_cls
        return handler_cls

    return _bind
|
||||
|
||||
@register_handler(comm.SPIDER_NAME_SIS)
class SisDBHandler(SQLiteDBHandler):
    """DB handler registered under ``comm.SPIDER_NAME_SIS``; targets table ``sis``."""

    def __init__(self, db_path=default_dbpath):
        super().__init__(db_path)
        # NOTE(review): if the base __init__ ever calls _create_tables(), this
        # attribute is not yet set at that point -- confirm against sqlite_base.
        self.tbl_name_sis = 'sis'

    def insert_item(self, item):
        """Pass ``item`` to ``insert_or_update_common`` with ``url`` as the unique key."""
        self.insert_or_update_common(
            item,
            tbl_name=self.tbl_name_sis,
            uniq_key='url',
            exists_do_nothing=True,
        )

    def _create_tables(self):
        """Create the sis001 data table when it is missing, then commit."""
        ddl = f'''
            CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                plate_name TEXT,
                title TEXT,
                url TEXT UNIQUE,
                size_text TEXT,
                size_gb REAL,
                update_date TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime')),
                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
            )
        '''
        self.cursor.execute(ddl)
        self.conn.commit()
|
||||
|
||||
|
||||
class IAFDQuery(SQLiteDBHandler):
|
||||
@register_handler(comm.SPIDER_NAME_U3C3)
class U3C3DBHandler(SQLiteDBHandler):
    """DB handler registered under ``comm.SPIDER_NAME_U3C3``; targets table ``u3c3``."""

    def __init__(self, db_path=default_dbpath):
        super().__init__(db_path)
        # Must be 'u3c3', not 'sis': SisDBHandler creates a 'sis' table with a
        # different column set (no category/torrent_url/magnet_url), so sharing
        # that name would make the two handlers collide on one table.
        # NOTE(review): if the base __init__ ever calls _create_tables(), this
        # attribute is not yet set at that point -- confirm against sqlite_base.
        self.tbl_name_u3c3 = 'u3c3'

    def insert_item(self, item):
        """Pass ``item`` to ``insert_or_update_common`` with ``url`` as the unique key."""
        self.insert_or_update_common(
            item,
            tbl_name=self.tbl_name_u3c3,
            uniq_key='url',
            exists_do_nothing=True,
        )

    def _create_tables(self):
        """Create the u001 data table when it is missing, then commit."""
        self.cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                category TEXT,
                title TEXT,
                url TEXT UNIQUE,
                torrent_url TEXT,
                magnet_url TEXT,
                size_text TEXT,
                size_gb REAL,
                update_date TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime')),
                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
            )
        ''')
        self.conn.commit()
|
||||
|
||||
|
||||
@register_handler(comm.SPIDER_NAME_IAFD)
|
||||
class IAFDDBHandler(SQLiteDBHandler):
|
||||
def __init__(self, db_path=shared_db_path):
|
||||
super().__init__(db_path)
|
||||
self.tbl_name_performers = 'iafd_performers'
|
||||
self.tbl_name_movies = 'iafd_movies'
|
||||
self.uniq_key = 'href'
|
||||
|
||||
def insert_item(self, item):
|
||||
pass
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def get_performers(self, **filters):
|
||||
try:
|
||||
@ -31,6 +31,13 @@ class SQLiteDBHandler:
|
||||
if sqlite_version < (3, 24, 0):
|
||||
self.lower_sqlite_version = True
|
||||
|
||||
def _create_tables(self):
    """Base implementation: creates nothing and returns ``None``."""
    return None


# Interface function: every subclass must implement it.
def insert_item(self, item):
    """Abstract hook for storing one item.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError("子类必须实现 insert_item 方法")
|
||||
|
||||
def get_table_columns_and_defaults(self, tbl_name):
|
||||
try:
|
||||
self.cursor.execute(f"PRAGMA table_info({tbl_name})")
|
||||
|
||||
@ -5,9 +5,11 @@
|
||||
|
||||
# items.py
|
||||
import scrapy
|
||||
import scrapy_proj.comm.comm_def as comm
|
||||
|
||||
# u3c3.in
|
||||
class U001Item(scrapy.Item):
|
||||
item_tpye = comm.ITEM_TYPE_LIST
|
||||
category = scrapy.Field()
|
||||
title = scrapy.Field()
|
||||
url = scrapy.Field()
|
||||
@ -19,6 +21,7 @@ class U001Item(scrapy.Item):
|
||||
|
||||
# sis001.com
|
||||
class Sis001Item(scrapy.Item):
|
||||
item_tpye = comm.ITEM_TYPE_LIST
|
||||
title = scrapy.Field()
|
||||
url = scrapy.Field()
|
||||
plate_name = scrapy.Field()
|
||||
@ -27,6 +30,7 @@ class Sis001Item(scrapy.Item):
|
||||
update_date = scrapy.Field()
|
||||
|
||||
class IAFDPersonItem(scrapy.Item):
|
||||
item_tpye = comm.ITEM_TYPE_ACTOR_INDEX
|
||||
name = scrapy.Field()
|
||||
href = scrapy.Field()
|
||||
from_astro_list = scrapy.Field()
|
||||
@ -35,6 +39,7 @@ class IAFDPersonItem(scrapy.Item):
|
||||
from_movie_list = scrapy.Field()
|
||||
|
||||
class IAFDMovieItem(scrapy.Item):
|
||||
item_tpye = comm.ITEM_TYPE_MOVIE_INDEX
|
||||
title = scrapy.Field()
|
||||
href = scrapy.Field()
|
||||
release_year = scrapy.Field()
|
||||
@ -43,6 +48,7 @@ class IAFDMovieItem(scrapy.Item):
|
||||
from_stu_list = scrapy.Field()
|
||||
|
||||
class IAFDPersonDetailItem(scrapy.Item):
|
||||
item_tpye = comm.ITEM_TYPE_ACTOR_DETAIL
|
||||
href = scrapy.Field()
|
||||
person = scrapy.Field()
|
||||
gender = scrapy.Field()
|
||||
@ -67,6 +73,7 @@ class IAFDPersonDetailItem(scrapy.Item):
|
||||
performer_aka = scrapy.Field()
|
||||
|
||||
class IAFDMovieDetailItem(scrapy.Item):
|
||||
item_tpye = comm.ITEM_TYPE_MOVIE_DETAIL
|
||||
title = scrapy.Field()
|
||||
href = scrapy.Field()
|
||||
# 可以根据实际需求添加更多影片详情字段
|
||||
@ -9,91 +9,39 @@
|
||||
#class ScrapyProjPipeline:
|
||||
# def process_item(self, item, spider):
|
||||
# return item
|
||||
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
import json
|
||||
import scrapy
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem
|
||||
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler
|
||||
from scrapy_proj.db_wapper.spider_db_handler import spider_handler_registry, U3C3DBHandler, SisDBHandler, IAFDDBHandler
|
||||
|
||||
class SQLitePipeline(SQLiteDBHandler):
|
||||
def __init__(self, db_path=None):
|
||||
super().__init__(db_path)
|
||||
self.tbl_name_u3c3 = 'u3c3'
|
||||
self.tbl_name_sis = 'sis'
|
||||
self._create_tables()
|
||||
|
||||
def _create_tables(self):
|
||||
# 创建 u001 数据表
|
||||
self.cursor.execute(f'''
|
||||
CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
category TEXT,
|
||||
title TEXT,
|
||||
url TEXT UNIQUE,
|
||||
torrent_url TEXT,
|
||||
magnet_url TEXT,
|
||||
size_text TEXT,
|
||||
size_gb REAL,
|
||||
update_date TEXT,
|
||||
created_at TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
updated_at TEXT DEFAULT (datetime('now', 'localtime'))
|
||||
)
|
||||
''')
|
||||
class SQLitePipeline():
|
||||
def __init__(self):
|
||||
self.db_handlers = {}
|
||||
|
||||
def open_spider(self, spider):
    """Instantiate and cache the DB handler registered for ``spider``.

    The handler class is looked up in ``spider_handler_registry`` by the
    lower-cased spider name; the new instance is stored in
    ``self.db_handlers`` under that same key.

    Raises:
        ValueError: if no handler class is registered for the spider.
    """
    spider_name = spider.name.lower()
    factory = spider_handler_registry.get(spider_name)
    if factory:
        self.db_handlers[spider_name] = factory()
        return
    raise ValueError(f"未注册 Spider {spider_name} 的数据库处理类")
|
||||
|
||||
# 创建 sis001 数据表
|
||||
self.cursor.execute(f'''
|
||||
CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
plate_name TEXT,
|
||||
title TEXT,
|
||||
url TEXT UNIQUE,
|
||||
size_text TEXT,
|
||||
size_gb REAL,
|
||||
update_date TEXT,
|
||||
created_at TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
updated_at TEXT DEFAULT (datetime('now', 'localtime'))
|
||||
)
|
||||
''')
|
||||
self.conn.commit()
|
||||
|
||||
def process_item(self, item, spider):
|
||||
if isinstance(item, U001Item):
|
||||
self._process_u001_item(item, spider)
|
||||
elif isinstance(item, Sis001Item):
|
||||
self._process_sis001_item(item, spider)
|
||||
elif isinstance(item, IAFDPersonItem):
|
||||
self._process_iafd_person_item(item, spider)
|
||||
elif isinstance(item, IAFDPersonDetailItem):
|
||||
self._process_iafd_person_detail_item(item, spider)
|
||||
elif isinstance(item, IAFDMovieItem):
|
||||
self._process_iafd_movie_item(item, spider)
|
||||
elif isinstance(item, IAFDMovieDetailItem):
|
||||
self._process_iafd_movie_detail_item(item, spider)
|
||||
return item
|
||||
|
||||
def _process_u001_item(self, item, spider):
|
||||
spider.logger.debug(f"insert one item. href:{spider.name}")
|
||||
return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True)
|
||||
|
||||
def _process_sis001_item(self, item, spider):
|
||||
spider.logger.debug(f"insert one item. href:{spider.name}")
|
||||
return self.insert_or_update_common(item, tbl_name=self.tbl_name_sis, uniq_key='url', exists_do_nothing=True)
|
||||
|
||||
def _process_iafd_person_item(self, item, spider):
|
||||
spider.logger.debug(f"deal with persion item. {item}")
|
||||
|
||||
def _process_iafd_movie_item(self, item, spider):
|
||||
spider.logger.debug(f"deal with movie item. {item}")
|
||||
|
||||
def _process_iafd_person_detail_item(self, item, spider):
|
||||
spider.logger.debug(f"deal with persion item. {item}")
|
||||
|
||||
def _process_iafd_movie_detail_item(self, item, spider):
|
||||
spider.logger.debug(f"deal with movie item. {item}")
|
||||
|
||||
def close_spider(self, spider):
    """Close and forget the DB handler that was opened for ``spider``.

    The handler is removed from ``self.db_handlers`` (keyed by the
    lower-cased spider name) and its ``close()`` is called. Nothing happens
    when no handler is stored for the spider.
    """
    # The pipeline holds no connection of its own (its __init__ only sets
    # ``db_handlers``), so there is no ``self.conn`` to close here; each
    # handler is closed through its own ``close()``.
    spider_name = spider.name.lower()
    handler = self.db_handlers.pop(spider_name, None)
    if handler is not None:
        handler.close()
|
||||
|
||||
def process_item(self, item, spider):
    """Store ``item`` through the handler opened for ``spider``.

    Args:
        item: the scraped item; it must be convertible with ``dict(item)``.
        spider: the running spider; its lower-cased ``name`` selects the
            handler in ``self.db_handlers``.

    Returns:
        The same ``item``, so that Scrapy and any later pipeline receive it.

    Raises:
        ValueError: if no handler is stored for the spider.
    """
    spider_name = spider.name.lower()
    handler = self.db_handlers.get(spider_name)

    if not handler:
        raise ValueError(f"未找到 Spider {spider_name} 的数据库处理器")

    # Render the item as single-line JSON; used only for the debug log.
    item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
    spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")

    handler.insert_item(item)

    # A pipeline stage has to hand the item on; returning nothing would pass
    # None downstream.
    return item
|
||||
|
||||
|
||||
@ -2,12 +2,13 @@ import scrapy
|
||||
import re
|
||||
from scrapy_proj.spiders.base_spider import BaseSpider
|
||||
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
|
||||
from scrapy_proj.db_wapper.iafd_query import IAFDQuery
|
||||
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
|
||||
from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
|
||||
|
||||
db_tools = IAFDQuery()
|
||||
db_tools = IAFDDBHandler()
|
||||
|
||||
class IAFDSpider(BaseSpider):
|
||||
name = "iafd"
|
||||
name = SPIDER_NAME_IAFD
|
||||
allowed_domains = ["iafd.com"]
|
||||
|
||||
host_url = "https://www.iafd.com"
|
||||
|
||||
@ -4,9 +4,10 @@ from urllib.parse import urljoin
|
||||
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
|
||||
from scrapy_proj.items import Sis001Item
|
||||
from scrapy_proj.utils.utils import parse_size_format, parse_date_to_datetime
|
||||
from scrapy_proj.comm.comm_def import SPIDER_NAME_SIS
|
||||
|
||||
class Sis001Spider(BaseSpider):
|
||||
name = "sis"
|
||||
name = SPIDER_NAME_SIS
|
||||
allowed_domains = ["sis001.com"]
|
||||
|
||||
def __init__(self, debug='False', begin=None, *args, **kwargs):
|
||||
@ -111,9 +112,9 @@ class Sis001Spider(BaseSpider):
|
||||
|
||||
# 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页
|
||||
up_date = parse_date_to_datetime(item['update_date'])
|
||||
self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
|
||||
#self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
|
||||
if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()) :
|
||||
self.logger.debug(f"find early data.")
|
||||
pass
|
||||
else:
|
||||
need_next = True
|
||||
|
||||
|
||||
@ -3,9 +3,10 @@ import scrapy
|
||||
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
|
||||
from scrapy_proj.items import U001Item
|
||||
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
|
||||
from scrapy_proj.comm.comm_def import SPIDER_NAME_U3C3
|
||||
|
||||
class U001Spider(BaseSpider):
|
||||
name = "u3c3"
|
||||
name = SPIDER_NAME_U3C3
|
||||
allowed_domains = ["u001.25img.com"]
|
||||
start_urls = ["https://u001.25img.com/?p=1"]
|
||||
|
||||
@ -37,9 +38,9 @@ class U001Spider(BaseSpider):
|
||||
|
||||
# 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页
|
||||
up_date = parse_date_to_datetime(item['update_date'])
|
||||
self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
|
||||
#self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
|
||||
if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()):
|
||||
self.logger.debug(f"find early data.")
|
||||
pass
|
||||
else:
|
||||
need_next = True
|
||||
yield item
|
||||
|
||||
Reference in New Issue
Block a user