'''
Word list source: https://diginoodles.com/projects/eowl
'''
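
# Usage (run from the directory containing ./data; the script filename is whatever this file is saved as):
#   python <script>.py init   # collect EOWL words and tag each batch via Azure OpenAI
#   python <script>.py redo   # retry result files whose responses did not match the request
#   python <script>.py gen    # build filtered word lists from the matched results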
import os
import json
import random
import time
import logging
import argparse
from collections import defaultdict
from pathlib import Path

from openai import AzureOpenAI

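# Azure OpenAI connection settings; the deployment name must match the deployment configured in the Azure portal.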
endpoint = "https://grammar.openai.azure.com/"
model_name = "gpt-4o"
deployment = "gpt4"

# Read the key from the environment instead of hardcoding the secret in source
# (the variable name AZURE_OPENAI_KEY is a project choice).
subscription_key = os.environ.get("AZURE_OPENAI_KEY", "")
api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    handlers=[logging.StreamHandler()]
)

BASE_DIR = './data'
WORDS_DIR = f"{BASE_DIR}/EOWL-v1.1.2/LF Delimited Format"
RESULT_DIR = f"{BASE_DIR}/result"
os.makedirs(RESULT_DIR, exist_ok=True)
TEMP_FILE = f"{BASE_DIR}/temp_words.txt"

batch_words_size = 100


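# Find the EOWL "Words" .txt files under the given folder.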
def find_words_files(folder):
    txt_files = []
    for f in Path(folder).glob("*.txt"):
        if "Words" in f.name:
            txt_files.append(f)
    return txt_files


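# Collect unique words of length >= 3 from the given files.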
def collect_words(files):
    words_set = set()
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.strip()
                if len(word) >= 3:
                    words_set.add(word)
    return list(words_set)


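# Write the collected words to TEMP_FILE, one word per line.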
def write_temp(words):
    with open(TEMP_FILE, 'w', encoding='utf-8') as f:
        for word in words:
            f.write(word + '\n')


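# Read TEMP_FILE back and yield the words in batches of batch_size.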
def read_batches(batch_size=batch_words_size):
    with open(TEMP_FILE, 'r', encoding='utf-8') as f:
        words = [line.strip() for line in f if line.strip()]
    for i in range(0, len(words), batch_size):
        yield words[i:i+batch_size]


# Build the tagging prompt; the model is asked to respond with pure JSON only,
# without any formatting or explanations.
def build_prompt(words):
    word_list = ", ".join(words)
    prompt = f"""
Please analyze the following list of English words and do the following:

1. Classify each word into a theme (like Animals, Plants, Materials, Body Parts, Clothes & Accessories, Food & Drinks, Places, Transportation, Sports, Colors, Numbers, Emotions, Tools, People & Occupations, etc.).
2. Identify the part of speech of each word (verb, noun, adjective, etc.).
3. Mark the frequency of usage of each word in everyday English as High, Medium, or Low.
4. Identify words with the same word root and group them.

Return a single JSON array in which each item is an object for one word, with these keys:
- w: the word
- t: theme (like Animals, Tools, etc.)
- p: part of speech (noun, verb, etc.)
- f: frequency (Low/Medium/High)
- s: same root group (array of words with the same root)

Respond with PURE JSON ONLY, without markdown or explanations.

Here are the words:
{word_list}
"""
    return prompt


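# Call the Azure OpenAI chat completion endpoint, retrying on failure; returns the response text or None after all retries fail.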
def call_openai_with_retry(prompt, retries=3, delay=5):
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": "You are an expert English linguist and lexicographer."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=16000,
                temperature=0.7,
                top_p=1.0,
                model=deployment
            )
            text = response.choices[0].message.content.strip()
            # If the response is still wrapped in a ```json fence, strip it.
            if text.startswith("```json"):
                text = text[7:-3].strip()
            return text
        except Exception as e:
            logging.warning(f"OpenAI request failed (attempt {attempt+1}): {e}")
            time.sleep(delay)
    logging.error("OpenAI request failed after all retries.")
    return None


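# Save one batch result; the filename encodes the batch index, whether the request and response lengths matched, and whether the response parsed as JSON.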
def save_result(index, req, resp, is_json):
    matched = bool(is_json and len(req) == len(resp))
    flag = "json" if is_json else "txt"
    match_str = "matched" if matched else "notmatch"
    filename = f"{RESULT_DIR}/{str(index).zfill(5)}_{match_str}_{flag}.json"
    data = {
        'req_len': len(req),
        'rsp_len': len(resp) if is_json else 0,
        'match': matched,
        'req': req,
        'rsp': resp
    }
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    logging.info(f"Saved result to {filename}")


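# Full pipeline: collect words from the EOWL files, write the temp list, then tag each batch with the model and save every result.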
def process_folder(folder):
    files = find_words_files(folder)
    logging.info(f"Found {len(files)} files to process.")
    words = collect_words(files)
    logging.info(f"Collected {len(words)} unique words.")
    write_temp(words)

    for idx, batch in enumerate(read_batches(), 1):
        logging.info(f"Processing batch {idx} with {len(batch)} words")
        prompt = build_prompt(batch)
        resp_text = call_openai_with_retry(prompt)

        if resp_text is None:
            save_result(idx, batch, "Failed to get response", False)
            continue

        try:
            resp_json = json.loads(resp_text)
            save_result(idx, batch, resp_json, True)
        except json.JSONDecodeError:
            logging.warning(f"Batch {idx} response is not valid JSON.")
            save_result(idx, batch, resp_text, False)

        time.sleep(2)  # pause between batches


# Redo logic: reprocess result files whose responses did not match the request.
def redo_results():
    files = sorted(Path(RESULT_DIR).glob('*.json'))
    for f in files:
        if 'matched' in f.name:
            continue

        logging.info(f"Redoing {f}")
        try:
            with open(f, 'r', encoding='utf-8') as fp:
                data = json.load(fp)
            words = data.get("req")
            if not words:
                logging.warning(f"No req in {f}")
                continue

            prompt = build_prompt(words)
            resp_text = call_openai_with_retry(prompt)
            if resp_text is None:
                logging.warning(f"Failed to get response: {f}")
                continue

            try:
                resp_json = json.loads(resp_text)
                if len(words) == len(resp_json):
                    logging.info(f"Got a complete response; rewriting file {f}")
                    f.unlink()
                    save_result(int(f.name[:5]), words, resp_json, True)
                else:
                    logging.warning(f"Response not complete: {f}, req len: {len(words)}, rsp len: {len(resp_json)}")
            except json.JSONDecodeError:
                logging.warning(f"Response is not valid JSON: {f}")

            time.sleep(2)  # pause between files

        except Exception as e:
            logging.error(f"Error processing {f}: {e}")


# Check whether a word contains no repeated letters.
def has_no_repeated_letters(word):
    return len(set(word)) == len(word)


def generate_wordlist():
    """
    Extract words with no repeated letters from the matched files under RESULT_DIR
    and write them, grouped by frequency (f), to words_{f}.txt.
    """
    word_map = defaultdict(list)
    all_words = set()

    # Filter first, then sort by the numeric batch index.
    matched_files = []
    for file in os.scandir(RESULT_DIR):
        # Keep only matched result files named with a five-digit batch index.
        if (file.is_file()
                and file.name.endswith('.json')
                and 'matched' in file.name
                and len(file.name) >= 5
                and file.name[:5].isdigit()):
            matched_files.append(file)

    for file in sorted(matched_files, key=lambda f: int(f.name[:5])):
        with open(file.path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        rsp = data.get('rsp', [])
        for item in rsp:
            word = item.get('w')
            freq = item.get('f')
            if word and freq and has_no_repeated_letters(word):
                word_map[freq].append(word)
                all_words.add(word)

    # Write one file per frequency bucket.
    for freq, words in word_map.items():
        filename = os.path.join(RESULT_DIR, f'words_{freq}.txt')
        with open(filename, 'w', encoding='utf-8') as f:
            for word in words:
                f.write(word + '\n')
        logging.info(f'✅ Wrote {filename} ({len(words)} words)')

    # Write the full word list.
    filename = os.path.join(RESULT_DIR, 'wordlist.txt')
    with open(filename, 'w', encoding='utf-8') as f:
        for word in all_words:
            f.write(word + '\n')
    logging.info(f'✅ Wrote {filename} ({len(all_words)} words)')


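# CLI entry point: init -> process the EOWL folder, redo -> retry unmatched results, gen -> generate the word lists.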
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('cmd', help='Command to run: init / redo / gen')
    args = parser.parse_args()

    if args.cmd == 'init':
        process_folder(WORDS_DIR)
    elif args.cmd == 'redo':
        redo_results()
    elif args.cmd == 'gen':
        generate_wordlist()
    else:
        print("❌ Unknown command, please use: init / redo / gen")


if __name__ == '__main__':
    main()