modify scripts
This commit is contained in:
17
scrapy_proj/scrapy_proj/comm/comm_def.py
Normal file
17
scrapy_proj/scrapy_proj/comm/comm_def.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
#
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
# Spider name identifiers.
SPIDER_NAME_SIS = "sis"
SPIDER_NAME_U3C3 = "u3c3"
SPIDER_NAME_IAFD = "iafd"

# Item type identifiers.
ITEM_TYPE_LIST = "list"
ITEM_TYPE_MOVIE_INDEX = "movie_index"
ITEM_TYPE_ACTOR_INDEX = "actor_index"
ITEM_TYPE_MOVIE_DETAIL = "movie_detail"
ITEM_TYPE_ACTOR_DETAIL = "actor_detail"
|
||||||
@ -2,16 +2,85 @@ import os
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
import logging
|
import logging
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, shared_db_path
|
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
|
||||||
|
import scrapy_proj.comm.comm_def as comm
|
||||||
|
|
||||||
|
# Registry mapping a lower-cased spider name to its DB handler class.
spider_handler_registry = {}


def register_handler(spider_name):
    """Return a class decorator that registers the class under ``spider_name``.

    The name is lower-cased when the decorator is applied, and the decorated
    class is stored in ``spider_handler_registry`` and returned unchanged.
    """
    def _register(handler_cls):
        registry_key = spider_name.lower()
        spider_handler_registry[registry_key] = handler_cls
        return handler_cls

    return _register
|
||||||
|
|
||||||
|
@register_handler(comm.SPIDER_NAME_SIS)
class SisDBHandler(SQLiteDBHandler):
    """DB handler registered for the ``sis`` spider; writes to the ``sis`` table."""

    def __init__(self, db_path=default_dbpath):
        """Initialise the base handler with ``db_path`` and set the table name."""
        super().__init__(db_path)
        self.tbl_name_sis = 'sis'

    def insert_item(self, item):
        """Insert ``item`` keyed on ``url``; an existing row is left untouched."""
        target_table = self.tbl_name_sis
        self.insert_or_update_common(
            item,
            tbl_name=target_table,
            uniq_key='url',
            exists_do_nothing=True,
        )

    def _create_tables(self):
        """Create the sis001 table if it does not exist, then commit."""
        create_sql = f'''
            CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                plate_name TEXT,
                title TEXT,
                url TEXT UNIQUE,
                size_text TEXT,
                size_gb REAL,
                update_date TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime')),
                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
            )
        '''
        self.cursor.execute(create_sql)
        self.conn.commit()
|
||||||
|
|
||||||
|
|
||||||
class IAFDQuery(SQLiteDBHandler):
|
@register_handler(comm.SPIDER_NAME_U3C3)
class U3C3DBHandler(SQLiteDBHandler):
    """DB handler registered for the ``u3c3`` spider; writes to the ``u3c3`` table."""

    def __init__(self, db_path=default_dbpath):
        """Initialise the base handler with ``db_path`` and set the table name."""
        super().__init__(db_path)
        # Must be 'u3c3', not 'sis': the sis table has a different column set
        # (no category / torrent_url / magnet_url), so sharing the name would
        # route u3c3 rows into a table that cannot hold them.
        self.tbl_name_u3c3 = 'u3c3'

    def insert_item(self, item):
        """Insert ``item`` keyed on ``url``; an existing row is left untouched."""
        self.insert_or_update_common(
            item,
            tbl_name=self.tbl_name_u3c3,
            uniq_key='url',
            exists_do_nothing=True,
        )

    def _create_tables(self):
        """Create the u3c3 table if it does not exist, then commit."""
        self.cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                category TEXT,
                title TEXT,
                url TEXT UNIQUE,
                torrent_url TEXT,
                magnet_url TEXT,
                size_text TEXT,
                size_gb REAL,
                update_date TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime')),
                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
            )
        ''')
        self.conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
@register_handler(comm.SPIDER_NAME_IAFD)
|
||||||
|
class IAFDDBHandler(SQLiteDBHandler):
|
||||||
def __init__(self, db_path=shared_db_path):
|
def __init__(self, db_path=shared_db_path):
|
||||||
super().__init__(db_path)
|
super().__init__(db_path)
|
||||||
self.tbl_name_performers = 'iafd_performers'
|
self.tbl_name_performers = 'iafd_performers'
|
||||||
self.tbl_name_movies = 'iafd_movies'
|
self.tbl_name_movies = 'iafd_movies'
|
||||||
self.uniq_key = 'href'
|
self.uniq_key = 'href'
|
||||||
|
|
||||||
|
def insert_item(self, item):
|
||||||
|
pass
|
||||||
|
|
||||||
# 按条件查询 href 列表
|
# 按条件查询 href 列表
|
||||||
def get_performers(self, **filters):
|
def get_performers(self, **filters):
|
||||||
try:
|
try:
|
||||||
@ -31,6 +31,13 @@ class SQLiteDBHandler:
|
|||||||
if sqlite_version < (3, 24, 0):
|
if sqlite_version < (3, 24, 0):
|
||||||
self.lower_sqlite_version = True
|
self.lower_sqlite_version = True
|
||||||
|
|
||||||
|
def _create_tables(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# 接口函数,必须在各个子类中实现
|
||||||
|
def insert_item(self, item):
    """Interface method that every subclass must implement.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    message = "子类必须实现 insert_item 方法"
    raise NotImplementedError(message)
|
||||||
|
|
||||||
def get_table_columns_and_defaults(self, tbl_name):
|
def get_table_columns_and_defaults(self, tbl_name):
|
||||||
try:
|
try:
|
||||||
self.cursor.execute(f"PRAGMA table_info({tbl_name})")
|
self.cursor.execute(f"PRAGMA table_info({tbl_name})")
|
||||||
|
|||||||
@ -5,9 +5,11 @@
|
|||||||
|
|
||||||
# items.py
|
# items.py
|
||||||
import scrapy
|
import scrapy
|
||||||
|
import scrapy_proj.comm.comm_def as comm
|
||||||
|
|
||||||
# u3c3.in
|
# u3c3.in
|
||||||
class U001Item(scrapy.Item):
|
class U001Item(scrapy.Item):
|
||||||
|
item_tpye = comm.ITEM_TYPE_LIST
|
||||||
category = scrapy.Field()
|
category = scrapy.Field()
|
||||||
title = scrapy.Field()
|
title = scrapy.Field()
|
||||||
url = scrapy.Field()
|
url = scrapy.Field()
|
||||||
@ -19,6 +21,7 @@ class U001Item(scrapy.Item):
|
|||||||
|
|
||||||
# sis001.com
|
# sis001.com
|
||||||
class Sis001Item(scrapy.Item):
|
class Sis001Item(scrapy.Item):
|
||||||
|
item_tpye = comm.ITEM_TYPE_LIST
|
||||||
title = scrapy.Field()
|
title = scrapy.Field()
|
||||||
url = scrapy.Field()
|
url = scrapy.Field()
|
||||||
plate_name = scrapy.Field()
|
plate_name = scrapy.Field()
|
||||||
@ -27,6 +30,7 @@ class Sis001Item(scrapy.Item):
|
|||||||
update_date = scrapy.Field()
|
update_date = scrapy.Field()
|
||||||
|
|
||||||
class IAFDPersonItem(scrapy.Item):
|
class IAFDPersonItem(scrapy.Item):
|
||||||
|
item_tpye = comm.ITEM_TYPE_ACTOR_INDEX
|
||||||
name = scrapy.Field()
|
name = scrapy.Field()
|
||||||
href = scrapy.Field()
|
href = scrapy.Field()
|
||||||
from_astro_list = scrapy.Field()
|
from_astro_list = scrapy.Field()
|
||||||
@ -35,6 +39,7 @@ class IAFDPersonItem(scrapy.Item):
|
|||||||
from_movie_list = scrapy.Field()
|
from_movie_list = scrapy.Field()
|
||||||
|
|
||||||
class IAFDMovieItem(scrapy.Item):
|
class IAFDMovieItem(scrapy.Item):
|
||||||
|
item_tpye = comm.ITEM_TYPE_MOVIE_INDEX
|
||||||
title = scrapy.Field()
|
title = scrapy.Field()
|
||||||
href = scrapy.Field()
|
href = scrapy.Field()
|
||||||
release_year = scrapy.Field()
|
release_year = scrapy.Field()
|
||||||
@ -43,6 +48,7 @@ class IAFDMovieItem(scrapy.Item):
|
|||||||
from_stu_list = scrapy.Field()
|
from_stu_list = scrapy.Field()
|
||||||
|
|
||||||
class IAFDPersonDetailItem(scrapy.Item):
|
class IAFDPersonDetailItem(scrapy.Item):
|
||||||
|
item_tpye = comm.ITEM_TYPE_ACTOR_DETAIL
|
||||||
href = scrapy.Field()
|
href = scrapy.Field()
|
||||||
person = scrapy.Field()
|
person = scrapy.Field()
|
||||||
gender = scrapy.Field()
|
gender = scrapy.Field()
|
||||||
@ -67,6 +73,7 @@ class IAFDPersonDetailItem(scrapy.Item):
|
|||||||
performer_aka = scrapy.Field()
|
performer_aka = scrapy.Field()
|
||||||
|
|
||||||
class IAFDMovieDetailItem(scrapy.Item):
|
class IAFDMovieDetailItem(scrapy.Item):
|
||||||
|
item_tpye = comm.ITEM_TYPE_MOVIE_DETAIL
|
||||||
title = scrapy.Field()
|
title = scrapy.Field()
|
||||||
href = scrapy.Field()
|
href = scrapy.Field()
|
||||||
# 可以根据实际需求添加更多影片详情字段
|
# 可以根据实际需求添加更多影片详情字段
|
||||||
@ -9,91 +9,39 @@
|
|||||||
#class ScrapyProjPipeline:
|
#class ScrapyProjPipeline:
|
||||||
# def process_item(self, item, spider):
|
# def process_item(self, item, spider):
|
||||||
# return item
|
# return item
|
||||||
|
import json
|
||||||
|
|
||||||
import os
|
|
||||||
import sqlite3
|
|
||||||
import scrapy
|
import scrapy
|
||||||
import logging
|
|
||||||
from datetime import datetime
|
|
||||||
from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem
|
from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem
|
||||||
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler
|
from scrapy_proj.db_wapper.spider_db_handler import spider_handler_registry, U3C3DBHandler, SisDBHandler, IAFDDBHandler
|
||||||
|
|
||||||
class SQLitePipeline(SQLiteDBHandler):
|
class SQLitePipeline():
|
||||||
def __init__(self, db_path=None):
|
def __init__(self):
|
||||||
super().__init__(db_path)
|
self.db_handlers = {}
|
||||||
self.tbl_name_u3c3 = 'u3c3'
|
|
||||||
self.tbl_name_sis = 'sis'
|
def open_spider(self, spider):
|
||||||
self._create_tables()
|
spider_name = spider.name.lower()
|
||||||
|
handler_class = spider_handler_registry.get(spider_name)
|
||||||
def _create_tables(self):
|
if not handler_class:
|
||||||
# 创建 u001 数据表
|
raise ValueError(f"未注册 Spider {spider_name} 的数据库处理类")
|
||||||
self.cursor.execute(f'''
|
self.db_handlers[spider_name] = handler_class()
|
||||||
CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
|
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
||||||
category TEXT,
|
|
||||||
title TEXT,
|
|
||||||
url TEXT UNIQUE,
|
|
||||||
torrent_url TEXT,
|
|
||||||
magnet_url TEXT,
|
|
||||||
size_text TEXT,
|
|
||||||
size_gb REAL,
|
|
||||||
update_date TEXT,
|
|
||||||
created_at TEXT DEFAULT (datetime('now', 'localtime')),
|
|
||||||
updated_at TEXT DEFAULT (datetime('now', 'localtime'))
|
|
||||||
)
|
|
||||||
''')
|
|
||||||
|
|
||||||
# 创建 sis001 数据表
|
|
||||||
self.cursor.execute(f'''
|
|
||||||
CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
|
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
||||||
plate_name TEXT,
|
|
||||||
title TEXT,
|
|
||||||
url TEXT UNIQUE,
|
|
||||||
size_text TEXT,
|
|
||||||
size_gb REAL,
|
|
||||||
update_date TEXT,
|
|
||||||
created_at TEXT DEFAULT (datetime('now', 'localtime')),
|
|
||||||
updated_at TEXT DEFAULT (datetime('now', 'localtime'))
|
|
||||||
)
|
|
||||||
''')
|
|
||||||
self.conn.commit()
|
|
||||||
|
|
||||||
def process_item(self, item, spider):
|
|
||||||
if isinstance(item, U001Item):
|
|
||||||
self._process_u001_item(item, spider)
|
|
||||||
elif isinstance(item, Sis001Item):
|
|
||||||
self._process_sis001_item(item, spider)
|
|
||||||
elif isinstance(item, IAFDPersonItem):
|
|
||||||
self._process_iafd_person_item(item, spider)
|
|
||||||
elif isinstance(item, IAFDPersonDetailItem):
|
|
||||||
self._process_iafd_person_detail_item(item, spider)
|
|
||||||
elif isinstance(item, IAFDMovieItem):
|
|
||||||
self._process_iafd_movie_item(item, spider)
|
|
||||||
elif isinstance(item, IAFDMovieDetailItem):
|
|
||||||
self._process_iafd_movie_detail_item(item, spider)
|
|
||||||
return item
|
|
||||||
|
|
||||||
def _process_u001_item(self, item, spider):
|
|
||||||
spider.logger.debug(f"insert one item. href:{spider.name}")
|
|
||||||
return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True)
|
|
||||||
|
|
||||||
def _process_sis001_item(self, item, spider):
|
|
||||||
spider.logger.debug(f"insert one item. href:{spider.name}")
|
|
||||||
return self.insert_or_update_common(item, tbl_name=self.tbl_name_sis, uniq_key='url', exists_do_nothing=True)
|
|
||||||
|
|
||||||
def _process_iafd_person_item(self, item, spider):
|
|
||||||
spider.logger.debug(f"deal with persion item. {item}")
|
|
||||||
|
|
||||||
def _process_iafd_movie_item(self, item, spider):
|
|
||||||
spider.logger.debug(f"deal with movie item. {item}")
|
|
||||||
|
|
||||||
def _process_iafd_person_detail_item(self, item, spider):
|
|
||||||
spider.logger.debug(f"deal with persion item. {item}")
|
|
||||||
|
|
||||||
def _process_iafd_movie_detail_item(self, item, spider):
|
|
||||||
spider.logger.debug(f"deal with movie item. {item}")
|
|
||||||
|
|
||||||
def close_spider(self, spider):
|
def close_spider(self, spider):
|
||||||
self.conn.close()
|
spider_name = spider.name.lower()
|
||||||
|
handler = self.db_handlers.pop(spider_name, None)
|
||||||
|
if handler:
|
||||||
|
handler.close()
|
||||||
|
|
||||||
|
def process_item(self, item, spider):
    """Store ``item`` through the DB handler opened for ``spider``.

    Args:
        item: The scraped item; it must be convertible with ``dict(item)``.
        spider: The running spider; its lower-cased ``name`` selects the
            handler in ``self.db_handlers``.

    Returns:
        The same ``item``, so that later pipeline stages still receive it.

    Raises:
        ValueError: If no handler is stored for this spider.
    """
    spider_name = spider.name.lower()
    handler = self.db_handlers.get(spider_name)

    if handler is None:
        raise ValueError(f"未找到 Spider {spider_name} 的数据库处理器")

    # Serialise to compact single-line JSON for the debug log.
    item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
    spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")

    handler.insert_item(item)

    # Scrapy passes the return value to the next pipeline component; returning
    # nothing would hand ``None`` to every later stage.
    return item
|
||||||
|
|
||||||
|
|||||||
@ -2,12 +2,13 @@ import scrapy
|
|||||||
import re
|
import re
|
||||||
from scrapy_proj.spiders.base_spider import BaseSpider
|
from scrapy_proj.spiders.base_spider import BaseSpider
|
||||||
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
|
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
|
||||||
from scrapy_proj.db_wapper.iafd_query import IAFDQuery
|
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
|
||||||
|
from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
|
||||||
|
|
||||||
db_tools = IAFDQuery()
|
db_tools = IAFDDBHandler()
|
||||||
|
|
||||||
class IAFDSpider(BaseSpider):
|
class IAFDSpider(BaseSpider):
|
||||||
name = "iafd"
|
name = SPIDER_NAME_IAFD
|
||||||
allowed_domains = ["iafd.com"]
|
allowed_domains = ["iafd.com"]
|
||||||
|
|
||||||
host_url = "https://www.iafd.com"
|
host_url = "https://www.iafd.com"
|
||||||
|
|||||||
@ -4,9 +4,10 @@ from urllib.parse import urljoin
|
|||||||
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
|
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
|
||||||
from scrapy_proj.items import Sis001Item
|
from scrapy_proj.items import Sis001Item
|
||||||
from scrapy_proj.utils.utils import parse_size_format, parse_date_to_datetime
|
from scrapy_proj.utils.utils import parse_size_format, parse_date_to_datetime
|
||||||
|
from scrapy_proj.comm.comm_def import SPIDER_NAME_SIS
|
||||||
|
|
||||||
class Sis001Spider(BaseSpider):
|
class Sis001Spider(BaseSpider):
|
||||||
name = "sis"
|
name = SPIDER_NAME_SIS
|
||||||
allowed_domains = ["sis001.com"]
|
allowed_domains = ["sis001.com"]
|
||||||
|
|
||||||
def __init__(self, debug='False', begin=None, *args, **kwargs):
|
def __init__(self, debug='False', begin=None, *args, **kwargs):
|
||||||
@ -111,9 +112,9 @@ class Sis001Spider(BaseSpider):
|
|||||||
|
|
||||||
# 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页
|
# 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页
|
||||||
up_date = parse_date_to_datetime(item['update_date'])
|
up_date = parse_date_to_datetime(item['update_date'])
|
||||||
self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
|
#self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
|
||||||
if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()) :
|
if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()) :
|
||||||
self.logger.debug(f"find early data.")
|
pass
|
||||||
else:
|
else:
|
||||||
need_next = True
|
need_next = True
|
||||||
|
|
||||||
|
|||||||
@ -3,9 +3,10 @@ import scrapy
|
|||||||
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
|
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
|
||||||
from scrapy_proj.items import U001Item
|
from scrapy_proj.items import U001Item
|
||||||
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
|
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
|
||||||
|
from scrapy_proj.comm.comm_def import SPIDER_NAME_U3C3
|
||||||
|
|
||||||
class U001Spider(BaseSpider):
|
class U001Spider(BaseSpider):
|
||||||
name = "u3c3"
|
name = SPIDER_NAME_U3C3
|
||||||
allowed_domains = ["u001.25img.com"]
|
allowed_domains = ["u001.25img.com"]
|
||||||
start_urls = ["https://u001.25img.com/?p=1"]
|
start_urls = ["https://u001.25img.com/?p=1"]
|
||||||
|
|
||||||
@ -37,9 +38,9 @@ class U001Spider(BaseSpider):
|
|||||||
|
|
||||||
# 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页
|
# 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页
|
||||||
up_date = parse_date_to_datetime(item['update_date'])
|
up_date = parse_date_to_datetime(item['update_date'])
|
||||||
self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
|
#self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
|
||||||
if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()):
|
if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()):
|
||||||
self.logger.debug(f"find early data.")
|
pass
|
||||||
else:
|
else:
|
||||||
need_next = True
|
need_next = True
|
||||||
yield item
|
yield item
|
||||||
|
|||||||
Reference in New Issue
Block a user