67 lines
1.9 KiB
Python
67 lines
1.9 KiB
Python
import requests
|
|
import re
|
|
import os
|
|
import json
|
|
import time
|
|
import csv
|
|
import logging
|
|
from datetime import datetime
|
|
import config
|
|
|
|
# 从"创建时间 2025-03-08 13:57:00" 中提取时间
|
|
def extract_create_time(input_str):
    """Pull a ``YYYY-MM-DD HH:MM:SS`` timestamp out of a longer string.

    Example: "创建时间 2025-03-08 13:57:00" -> "2025-03-08 13:57:00".

    Args:
        input_str: Text that may contain a timestamp.

    Returns:
        The first timestamp substring found, or ``input_str`` itself when
        the text contains no timestamp.
    """
    found = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', input_str)
    # No timestamp present: hand back exactly what was passed in.
    if found is None:
        return input_str
    return found.group(0)
|
|
|
|
# 从 "read-374864.html" 中获取数字编号
|
|
def extract_page_num(page_str, default_num=0):
    """Extract the numeric id from a page name such as "read-374864.html".

    Args:
        page_str: Page file name or URL to search. ``None`` is accepted and
            treated as "no id present".
        default_num: Value returned when no id can be extracted.

    Returns:
        The id digits as a string when ``page_str`` contains
        ``read-<digits>.html``; otherwise ``default_num``.
    """
    # re.search raises TypeError on None; a missing link should simply
    # fall back to the default like any other non-matching input.
    if page_str is None:
        return default_num

    match = re.search(r'read-(\d+)\.html', page_str)
    return match.group(1) if match else default_num
|
|
|
|
# 从 "book-5549.html" 中获取数字编号
|
|
def extract_book_num(page_str, default_num=0):
    """Extract the numeric id from a book page name such as "book-5549.html".

    Args:
        page_str: Page file name or URL to search. ``None`` is accepted and
            treated as "no id present".
        default_num: Value returned when no id can be extracted.

    Returns:
        The id digits as a string when ``page_str`` contains
        ``book-<digits>.html``; otherwise ``default_num``.
    """
    # re.search raises TypeError on None; a missing link should simply
    # fall back to the default like any other non-matching input.
    if page_str is None:
        return default_num

    match = re.search(r'book-(\d+)\.html', page_str)
    return match.group(1) if match else default_num
|
|
|
|
# 目录页,获取更新时间和字数
|
|
def extract_chapter_uptime_words(input_str):
    """Parse the word count and update time from table-of-contents text.

    Args:
        input_str: Text that may contain "字数:<digits>" and
            "更新时间:<YYYY-MM-DD HH:MM:SS>".

    Returns:
        A ``(words, update_time)`` tuple. ``words`` is the digit string
        following "字数:", or the integer ``0`` when that field is absent.
        ``update_time`` is the timestamp string following "更新时间:", or
        the current time formatted as ``%Y-%m-%d %H:%M:%S`` when absent.
    """
    count_hit = re.search(r'字数:(\d+)', input_str)
    if count_hit:
        words = count_hit.group(1)
    else:
        words = 0

    stamp_hit = re.search(r'更新时间:(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', input_str)
    if stamp_hit:
        update_time = stamp_hit.group(1)
    else:
        # No update time in the text: substitute "now" in the same format.
        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    return words, update_time
|
|
|
|
# 处理 [都市] 的方括号
|
|
def remove_brackets_regex(input_str):
    """Return the text inside a leading ``[...]`` pair, e.g. "[都市]" -> "都市".

    Args:
        input_str: Text that may begin with a bracketed tag.

    Returns:
        The content of the first bracket pair when ``input_str`` starts
        with ``[``...``]``; otherwise ``input_str`` unchanged.
    """
    # re.match anchors at the start, so brackets later in the string are
    # ignored; the lazy group stops at the first closing bracket.
    leading = re.match(r'\[(.*?)\]', input_str)
    return input_str if leading is None else leading.group(1)
|
|
|
|
|