600 多个文本文件合并

Dify 对上传文件的数量有限制，因此先用程序把这些文件合并，再上传进行向量化。

import os

def merge_txt_files(folder_path, output_file):
    """Merge every ``.txt`` file in a folder into one UTF-8 text file.

    Only the top level of *folder_path* is scanned (no recursion) and files
    are merged in sorted filename order. Each file's content is preceded by a
    banner made of two 50-character "=" lines around its filename, and is
    followed by a single newline.

    A file that cannot be opened or decoded as UTF-8 is reported on stdout
    and skipped entirely (no banner is written for it); the merge then
    continues with the remaining files. Errors while writing *output_file*
    are not swallowed.

    :param folder_path: directory containing the .txt files to merge
    :param output_file: path of the merged file; overwritten if it exists
    """
    separator = '=' * 50

    # A previous run may have left the merged file inside folder_path itself;
    # it must not be merged into its own (just truncated) output.
    output_abs = os.path.abspath(output_file)
    files = sorted(
        f for f in os.listdir(folder_path)
        if f.endswith('.txt')
        and os.path.abspath(os.path.join(folder_path, f)) != output_abs
    )

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for file in files:
            file_path = os.path.join(folder_path, file)

            # Read first, so a failing file does not leave an empty banner.
            try:
                with open(file_path, 'r', encoding='utf-8') as infile:
                    content = infile.read()
            except (OSError, UnicodeError) as e:
                print(f"处理文件 {file} 时出错: {str(e)}")
                continue

            # Real newlines here: an escaped backslash ('\\n') would write the
            # two characters "\" and "n" into the merged file.
            outfile.write(f'\n\n{separator}\n')
            outfile.write(f'文件名：{file}\n')
            outfile.write(f'{separator}\n\n')
            outfile.write(content)
            outfile.write('\n')

if __name__ == "__main__":
    # Script entry point: merge one group folder into a single text file
    # placed next to it, then report completion.
    group_dir = "/Users/junwu/Downloads/doc_all_txt/group_023"
    merged_path = "/Users/junwu/Downloads/doc_all_txt/group_023_merged.txt"
    merge_txt_files(folder_path=group_dir, output_file=merged_path)
    print("文件合并完成！")

import os
import docx2txt
import math

def _year_tag(rel_path):
    """Return the "[...]" filename prefix for a transcript folder, or ""."""
    if "演讲实录" not in rel_path:
        return ""
    # NOTE(review): presumably these folders are named "<year> 演讲实录", so
    # the first whitespace-separated token is the year -- confirm.
    token = rel_path.split()[0]
    # The token may still contain directory separators (e.g. "a/2019" from
    # "a/2019 演讲实录"); left in place they would turn the output filename
    # into a path through a directory that does not exist.
    for sep in (os.sep, os.altsep):
        if sep:
            token = token.replace(sep, "_")
    return f"[{token}]"


def _unique_name(name, used):
    """Return *name*, or a "_2"/"_3"/... variant not yet recorded in *used*."""
    stem, ext = os.path.splitext(name)
    candidate = name
    counter = 2
    # Compare case-insensitively so names differing only in case do not
    # overwrite each other on case-insensitive filesystems.
    while candidate.casefold() in used:
        candidate = f"{stem}_{counter}{ext}"
        counter += 1
    used.add(candidate.casefold())
    return candidate


def convert_to_txt(input_path, output_path, *, group_size=20):
    """Convert Word files under a directory tree to .txt, grouped in subfolders.

    Walks *input_path* recursively, extracts text from every ``.doc`` /
    ``.docx`` file with ``docx2txt.process`` and writes the results as UTF-8
    ``.txt`` files into ``group_001``, ``group_002``, ... subfolders of
    *output_path*, each holding at most *group_size* files.

    Files found below a folder whose relative path contains "演讲实录" get a
    "[...]" prefix derived from that path. If two documents would produce the
    same output name, later ones get a "_2", "_3", ... suffix instead of
    overwriting the earlier file.

    A document that fails to convert is reported on stdout and skipped. All
    extracted text is held in memory until the grouping phase writes it out.

    :param input_path: root directory to search for Word documents
    :param output_path: directory that receives the group subfolders;
        created if missing
    :param group_size: maximum number of files per group folder (default 20)
    :raises ValueError: if *group_size* is smaller than 1
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")

    os.makedirs(output_path, exist_ok=True)

    # (output filename, extracted text) for every successful conversion.
    processed_files = []
    used_names = set()

    for root, dirs, files in os.walk(input_path):
        # Sorting in place fixes the traversal order, so the same input tree
        # always yields the same group assignment.
        dirs.sort()
        year_info = _year_tag(os.path.relpath(root, input_path))

        for file in sorted(files):
            if not file.lower().endswith(('.doc', '.docx')):
                continue

            input_file = os.path.join(root, file)
            name = os.path.splitext(file)[0]

            # Best-effort: one unreadable document must not abort the batch.
            # NOTE(review): docx2txt targets .docx; legacy binary .doc files
            # presumably fail here and are only reported -- confirm.
            try:
                text = docx2txt.process(input_file)
            except Exception as e:
                print(f"转换 {file} 时出错: {str(e)}")
                continue

            # Reserve the name only after a successful conversion.
            new_name = _unique_name(f"{year_info}{name}.txt", used_names)
            processed_files.append((new_name, text))
            print(f"已转换: {file} -> {new_name}")

    total_files = len(processed_files)
    group_starts = range(0, total_files, group_size)

    for group_no, start_idx in enumerate(group_starts, start=1):
        folder_name = f"group_{group_no:03d}"
        folder_path = os.path.join(output_path, folder_name)
        os.makedirs(folder_path, exist_ok=True)

        current_group = processed_files[start_idx:start_idx + group_size]
        for filename, content in current_group:
            output_file = os.path.join(folder_path, filename)
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(content)

        print(f"已创建文件夹 {folder_name}，包含 {len(current_group)} 个文件")

if __name__ == "__main__":
    # Script entry point: convert the Word documents under the source folder
    # and write the grouped .txt output to the target folder.
    source_root = "/Users/junwu/Downloads/doc"
    target_root = "/Users/junwu/Downloads/doc_all_txt"
    convert_to_txt(input_path=source_root, output_path=target_root)
    print("转换完成！")