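# Dify limits the number of files that can be uploaded, so merge the txt files with this script before uploading them for vectorization.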
import os
def merge_txt_files(folder_path, output_file):
    """Merge every ``.txt`` file in a folder into one UTF-8 text file.

    Files are processed in sorted name order. Each file's content is
    preceded by a banner containing its file name so the individual pieces
    stay identifiable inside the merged document. Files that cannot be read
    are reported on stdout and skipped; their banner is still written.

    :param folder_path: Directory that contains the ``.txt`` files.
    :param output_file: Path of the merged file; overwritten if it exists.
    """
    # The output may live inside folder_path (e.g. left over from an earlier
    # run). Merging it into itself would read a file that is being written.
    output_abs = os.path.abspath(output_file)
    files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith('.txt')
        and os.path.isfile(os.path.join(folder_path, name))
        and os.path.abspath(os.path.join(folder_path, name)) != output_abs
    )
    separator = '=' * 50

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for file in files:
            file_path = os.path.join(folder_path, file)
            # Banner with the file name. Real newline characters are needed
            # here so the banner sits on its own lines.
            outfile.write('\n\n' + separator + '\n')
            outfile.write(f'文件名:{file}\n')
            outfile.write(separator + '\n\n')
            try:
                with open(file_path, 'r', encoding='utf-8') as infile:
                    content = infile.read()
            except (OSError, UnicodeDecodeError) as e:
                # Best effort: report the unreadable file and keep merging.
                print(f"处理文件 {file} 时出错: {str(e)}")
            else:
                outfile.write(content)
                outfile.write('\n')
if __name__ == "__main__":
    # Script entry point: merge one group of converted txt files into a
    # single file. The paths are hard-coded.
    output_file = "/Users/junwu/Downloads/doc_all_txt/group_023_merged.txt"
    folder_path = "/Users/junwu/Downloads/doc_all_txt/group_023"
    merge_txt_files(folder_path=folder_path, output_file=output_file)
    print("文件合并完成!")
import os
import docx2txt
import math
def convert_to_txt(input_path, output_path, group_size=20):
    """Convert Word documents under a directory tree into grouped txt files.

    Walks ``input_path`` recursively, extracts the text of every ``.doc`` /
    ``.docx`` file with ``docx2txt.process`` and writes the results as UTF-8
    ``.txt`` files into ``output_path/group_001``, ``group_002``, ... with at
    most ``group_size`` files per sub-folder. Documents that fail to convert
    are reported on stdout and skipped.

    NOTE(review): docx2txt presumably only understands the ``.docx`` format,
    so legacy ``.doc`` files are expected to land in the error branch -
    confirm.

    :param input_path: Root directory searched for Word documents.
    :param output_path: Directory that receives the ``group_NNN`` folders;
        created if missing.
    :param group_size: Maximum number of txt files per sub-folder
        (default 20).
    :raises ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")

    os.makedirs(output_path, exist_ok=True)

    # (txt file name, extracted text) pairs. Writing is deferred until every
    # document has been converted so groups are cut from the complete list.
    processed_files = []
    # Lower-cased names already taken, so two documents never map to the
    # same output file (also on case-insensitive filesystems).
    used_names = set()

    for root, dirs, files in os.walk(input_path):
        # Sort in place so the traversal order, and therefore the group
        # membership, is reproducible between runs.
        dirs.sort()
        for file in sorted(files):
            # Only Word documents are of interest.
            if not file.lower().endswith(('.doc', '.docx')):
                continue

            input_file = os.path.join(root, file)
            name = os.path.splitext(file)[0]

            rel_path = os.path.relpath(root, input_path)
            year_info = ""
            if "演讲实录" in rel_path:
                # First whitespace-separated token of the relative path
                # (presumably the year - verify against the folder naming).
                # Path separators are replaced so the prefix can never point
                # into a sub-directory that does not exist.
                prefix = rel_path.split()[0].replace(os.sep, "_")
                if os.altsep:
                    prefix = prefix.replace(os.altsep, "_")
                year_info = f"[{prefix}]"

            # Pick a txt name that has not been used yet; a numeric suffix
            # keeps same-named documents from overwriting each other.
            stem = f"{year_info}{name}"
            new_name = f"{stem}.txt"
            counter = 2
            while new_name.lower() in used_names:
                new_name = f"{stem}_{counter}.txt"
                counter += 1

            try:
                text = docx2txt.process(input_file)
            except Exception as e:
                # Deliberately broad: one unreadable document should be
                # reported and skipped, not abort the whole batch.
                print(f"转换 {file} 时出错: {str(e)}")
                continue

            processed_files.append((new_name, text))
            used_names.add(new_name.lower())
            print(f"已转换: {file} -> {new_name}")

    # Cut the converted documents into consecutive groups and write each
    # group into its own numbered sub-folder.
    total_files = len(processed_files)
    for index, start in enumerate(range(0, total_files, group_size), start=1):
        folder_name = f"group_{index:03d}"
        folder_path = os.path.join(output_path, folder_name)
        os.makedirs(folder_path, exist_ok=True)

        current_group = processed_files[start:start + group_size]
        for filename, content in current_group:
            output_file = os.path.join(folder_path, filename)
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(content)

        print(f"已创建文件夹 {folder_name},包含 {len(current_group)} 个文件")
if __name__ == "__main__":
    # Script entry point: convert the Word documents below the input
    # directory into grouped txt files. The paths are hard-coded.
    output_dir = "/Users/junwu/Downloads/doc_all_txt"
    input_dir = "/Users/junwu/Downloads/doc"
    convert_to_txt(input_path=input_dir, output_path=output_dir)
    print("转换完成!")