|
|
@@ -1,36 +1,31 @@
|
|
|
-package top.zhixinghe1.money;
|
|
|
+package top.zhixinghe1.money.agg;
|
|
|
|
|
|
import me.tongfei.progressbar.ProgressBar;
|
|
|
import me.tongfei.progressbar.ProgressBarBuilder;
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
+import top.zhixinghe1.money.agg.entity.CalResult;
|
|
|
+import top.zhixinghe1.money.agg.entity.CalRunable;
|
|
|
+import top.zhixinghe1.money.agg.entity.CalTask;
|
|
|
+import top.zhixinghe1.money.agg.entity.Word;
|
|
|
|
|
|
-import java.io.BufferedInputStream;
|
|
|
-import java.io.BufferedOutputStream;
|
|
|
import java.io.BufferedReader;
|
|
|
import java.io.BufferedWriter;
|
|
|
import java.io.File;
|
|
|
-import java.io.FileInputStream;
|
|
|
-import java.io.FileOutputStream;
|
|
|
import java.io.FileReader;
|
|
|
import java.io.FileWriter;
|
|
|
-import java.io.IOException;
|
|
|
import java.text.SimpleDateFormat;
|
|
|
import java.time.temporal.ChronoUnit;
|
|
|
import java.util.ArrayList;
|
|
|
import java.util.Arrays;
|
|
|
import java.util.BitSet;
|
|
|
-import java.util.Date;
|
|
|
import java.util.HashMap;
|
|
|
-import java.util.HashSet;
|
|
|
import java.util.List;
|
|
|
import java.util.Map;
|
|
|
-import java.util.Objects;
|
|
|
import java.util.Set;
|
|
|
import java.util.concurrent.CopyOnWriteArraySet;
|
|
|
import java.util.concurrent.ExecutorService;
|
|
|
import java.util.concurrent.Executors;
|
|
|
import java.util.concurrent.LinkedBlockingQueue;
|
|
|
-import java.util.concurrent.TimeUnit;
|
|
|
import java.util.function.Function;
|
|
|
import java.util.regex.Matcher;
|
|
|
import java.util.regex.Pattern;
|
|
|
@@ -39,7 +34,7 @@ import java.util.stream.Collectors;
|
|
|
/**
|
|
|
* 文本聚合 程序
|
|
|
*/
|
|
|
-public class AggApplication {
|
|
|
+public class Agg {
|
|
|
|
|
|
private static final int perTaskNum = 10000;
|
|
|
|
|
|
@@ -49,49 +44,24 @@ public class AggApplication {
|
|
|
|
|
|
private static BitSet bitmap = null;
|
|
|
|
|
|
- private static final Pattern aggFilePattern = Pattern.compile("长尾词_合并_聚合_\\d+_\\d+.txt");
|
|
|
-
|
|
|
- public static void main(String[] args) throws IOException, InterruptedException {
|
|
|
-
|
|
|
-// String dataDirPath = args[0];
|
|
|
- String dataDirPath = "E:\\ChenYL\\CodeRepository\\money-mining-python\\data\\test";
|
|
|
+ public void process(String dataDirPath) throws Exception {
|
|
|
if (StringUtils.isBlank(dataDirPath)) {
|
|
|
- System.out.println("没有输入目标数据路径");
|
|
|
- return;
|
|
|
+ throw new Exception("没有输入目标数据路径");
|
|
|
}
|
|
|
|
|
|
// 判断传入路径是否有效
|
|
|
File dataDir = new File(dataDirPath);
|
|
|
if (!dataDir.exists() || !dataDir.isDirectory()) {
|
|
|
- System.out.println(String.format("数据目录路径不存在,%s", dataDirPath));
|
|
|
- return;
|
|
|
+ throw new Exception(String.format("数据目录路径不存在,%s", dataDirPath));
|
|
|
}
|
|
|
|
|
|
// 判断关键资源文件是否存在
|
|
|
- List<String> fileNameList = Arrays.asList("长尾词_合并_分词.txt", "长尾词_合并.txt", "长尾词_合并_倒排索引.txt");
|
|
|
+ List<String> fileNameList = Arrays.asList("长尾词.txt", "长尾词_分词.txt", "长尾词_倒排索引.txt");
|
|
|
for (String fileName : fileNameList) {
|
|
|
String resFilePath = String.join(File.separator, dataDirPath, fileName);
|
|
|
File resfile = new File(resFilePath);
|
|
|
if (!resfile.exists() || !resfile.isFile()) {
|
|
|
- System.out.println(String.format("文件不存在!文件路径:%s", resFilePath));
|
|
|
- return;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- // 归档历史数据文件
|
|
|
- File[] files = dataDir.listFiles();
|
|
|
- List<File> historyAggFile = Arrays.stream(files).filter(file -> {
|
|
|
- Matcher matcher = aggFilePattern.matcher(file.getName());
|
|
|
- return matcher.find();
|
|
|
- }).collect(Collectors.toList());
|
|
|
- if (Objects.nonNull(historyAggFile) || historyAggFile.size() > 0) {
|
|
|
- SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
|
|
|
- String archivePath = String.join(File.separator, dataDirPath, String.format("长尾词_聚合_归档_%s", sdf.format(new Date())));
|
|
|
- File archiveDir = new File(archivePath);
|
|
|
- archiveDir.mkdirs();
|
|
|
- for (File historyFile : historyAggFile) {
|
|
|
- String destPath = String.join(File.separator, archivePath, historyFile.getName());
|
|
|
- historyFile.renameTo(new File(destPath));
|
|
|
+ throw new Exception(String.format("文件不存在!文件路径:%s", resFilePath));
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -100,7 +70,7 @@ public class AggApplication {
|
|
|
Pattern pattern = Pattern.compile("([^,]+)");
|
|
|
|
|
|
// 构造关键词缓存
|
|
|
- try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_合并_分词.txt"));
|
|
|
+ try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_分词.txt"));
|
|
|
BufferedReader br = new BufferedReader(reader)) {
|
|
|
String line = null;
|
|
|
while ((line = br.readLine()) != null) {
|
|
|
@@ -136,7 +106,7 @@ public class AggApplication {
|
|
|
}
|
|
|
|
|
|
// 构建倒排索引缓存
|
|
|
- try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_合并_倒排索引.txt"));
|
|
|
+ try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_倒排索引.txt"));
|
|
|
BufferedReader br = new BufferedReader(reader)) {
|
|
|
String line = null;
|
|
|
while ((line = br.readLine()) != null) {
|
|
|
@@ -178,11 +148,11 @@ public class AggApplication {
|
|
|
// 提交任务
|
|
|
ExecutorService executorService = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
|
|
|
for (CalTask calTask : calTasks) {
|
|
|
- executorService.submit(new CalRunable(calTask.getStartPos(), calTask.getEndPos(), dataDirPath, wordCache, indexCache, bitmap, queue));
|
|
|
+ executorService.submit(new CalRunable(calTask.getStartPos(), calTask.getEndPos(), wordCache, indexCache, bitmap, queue));
|
|
|
}
|
|
|
|
|
|
SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
|
|
|
- String aggFilePath = String.join(File.separator, dataDirPath, String.format("长尾词_合并_聚合_%s.txt", sdf.format(new Date())));
|
|
|
+ String aggFilePath = String.join(File.separator, dataDirPath, "长尾词_聚合结果_临时.txt");
|
|
|
ProgressBarBuilder progressBarBuilder = new ProgressBarBuilder().setTaskName("文本聚合计算")
|
|
|
.setInitialMax(totalWord)
|
|
|
.setUnit("个", 1)
|
|
|
@@ -210,12 +180,12 @@ public class AggApplication {
|
|
|
pb.step();
|
|
|
|
|
|
if (taskNum == currentTaskProgress) {
|
|
|
- break;
|
|
|
+ break;
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- executorService.awaitTermination(1, TimeUnit.MINUTES);
|
|
|
+ executorService.shutdown();
|
|
|
System.out.println("聚合任务执行完成");
|
|
|
}
|
|
|
|