|
|
@@ -1,19 +1,31 @@
|
|
|
package top.zhixinghe1.money;
|
|
|
|
|
|
-import redis.clients.jedis.Jedis;
|
|
|
-import redis.clients.jedis.JedisPool;
|
|
|
+import org.apache.commons.lang3.StringUtils;
|
|
|
|
|
|
+import java.io.BufferedInputStream;
|
|
|
+import java.io.BufferedOutputStream;
|
|
|
import java.io.BufferedReader;
|
|
|
import java.io.File;
|
|
|
-import java.io.FileNotFoundException;
|
|
|
+import java.io.FileInputStream;
|
|
|
+import java.io.FileOutputStream;
|
|
|
import java.io.FileReader;
|
|
|
import java.io.IOException;
|
|
|
-import java.io.Reader;
|
|
|
import java.text.SimpleDateFormat;
|
|
|
+import java.util.ArrayList;
|
|
|
import java.util.Arrays;
|
|
|
+import java.util.BitSet;
|
|
|
import java.util.Date;
|
|
|
+import java.util.HashMap;
|
|
|
+import java.util.HashSet;
|
|
|
import java.util.List;
|
|
|
+import java.util.Map;
|
|
|
import java.util.Objects;
|
|
|
+import java.util.Set;
|
|
|
+import java.util.concurrent.CopyOnWriteArraySet;
|
|
|
+import java.util.concurrent.ExecutorService;
|
|
|
+import java.util.concurrent.Executors;
|
|
|
+import java.util.concurrent.TimeUnit;
|
|
|
+import java.util.function.Function;
|
|
|
import java.util.regex.Matcher;
|
|
|
import java.util.regex.Pattern;
|
|
|
import java.util.stream.Collectors;
|
|
|
@@ -23,66 +35,179 @@ import java.util.stream.Collectors;
|
|
|
*/
|
|
|
public class AggApplication {
|
|
|
|
|
|
-// public static void main(String[] args) {
|
|
|
-//// String dataDirPath = "D:\\Documents\\ChenYL\\CodeRepository\\money-mining-python\\data";
|
|
|
-// String dataDirPath = "D:\\Documents\\ChenYL\\CodeRepository\\money-mining-python\\data\\test";
|
|
|
-//
|
|
|
-// // 判断传入路径是否有效
|
|
|
-// File dataDir = new File(dataDirPath);
|
|
|
-// if (!dataDir.exists() || !dataDir.isDirectory()) {
|
|
|
-// System.out.println(String.format("数据目录路径不存在,%s", dataDirPath));
|
|
|
-// return;
|
|
|
-// }
|
|
|
-//
|
|
|
-// // 判断关键资源文件是否存在
|
|
|
-// List<String> fileNameList = Arrays.asList("长尾词_合并_分词.txt", "长尾词_合并_聚合.txt", "长尾词_合并.txt", "长尾词_合并_倒排索引.txt");
|
|
|
-// for (String fileName : fileNameList) {
|
|
|
-// String resFilePath = String.join(File.separator, dataDirPath, fileName);
|
|
|
-// File resfile = new File(resFilePath);
|
|
|
-// if (!resfile.exists() || !resfile.isFile()) {
|
|
|
-// System.out.println(String.format("文件不存在!文件路径:%s", resFilePath));
|
|
|
-// return;
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// // 归档历史数据文件
|
|
|
-// File[] files = dataDir.listFiles();
|
|
|
-// Pattern aggFilePattern = Pattern.compile("长尾词_合并_聚合_\\d+_\\d+.txt");
|
|
|
-// List<File> historyAggFile = Arrays.stream(files).filter(file -> {
|
|
|
-// Matcher matcher = aggFilePattern.matcher(file.getName());
|
|
|
-// return matcher.find();
|
|
|
-// }).collect(Collectors.toList());
|
|
|
-// if (Objects.nonNull(historyAggFile) || historyAggFile.size() > 0) {
|
|
|
-// SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
|
|
|
-// String archivePath = String.join(File.separator, dataDirPath, String.format("长尾词_聚合_归档_%s", sdf.format(new Date())));
|
|
|
-// File archiveDir = new File(archivePath);
|
|
|
-// archiveDir.mkdirs();
|
|
|
-// for (File historyFile : historyAggFile) {
|
|
|
-// String destPath = String.join(File.separator, archivePath, historyFile.getName());
|
|
|
-// System.out.println(destPath);
|
|
|
-// historyFile.renameTo(new File(destPath));
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// JedisPool pool = new JedisPool("127.0.0.1", 6379);
|
|
|
-// try (Jedis jedis = pool.getResource()) {
|
|
|
-// try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_合并_分词.txt"));
|
|
|
-// BufferedReader br = new BufferedReader(reader)) {
|
|
|
-// String line = null;
|
|
|
-// while ((line = br.readLine()) != null) {
|
|
|
-// System.out.println(line);
|
|
|
-// System.out.println("暂停");
|
|
|
-// }
|
|
|
-//
|
|
|
-// } catch (IOException e) {
|
|
|
-// throw new RuntimeException(e);
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-
|
|
|
- public static void main(String[] args) throws InterruptedException {
|
|
|
- System.out.println("hello python start");
|
|
|
- Thread.sleep(5000);
|
|
|
- System.out.println("hello python end");
|
|
|
+ private static final int perTaskNum = 10000;
|
|
|
+
|
|
|
+ private static final Map<Integer, Word> wordCache = new HashMap();
|
|
|
+
|
|
|
+ private static final Map<String, Set<Integer>> indexCache = new HashMap();
|
|
|
+
|
|
|
+ private static BitSet bitmap = null;
|
|
|
+
|
|
|
+ private static final Pattern aggFilePattern = Pattern.compile("长尾词_合并_聚合_\\d+_\\d+.txt");
|
|
|
+
|
|
|
+ public static void main(String[] args) throws IOException, InterruptedException {
|
|
|
+
|
|
|
+// String dataDirPath = args[0];
|
|
|
+ String dataDirPath = "E:\\ChenYL\\CodeRepository\\money-mining-python\\data\\test";
|
|
|
+ if (StringUtils.isBlank(dataDirPath)) {
|
|
|
+ System.out.println("没有输入目标数据路径");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 判断传入路径是否有效
|
|
|
+ File dataDir = new File(dataDirPath);
|
|
|
+ if (!dataDir.exists() || !dataDir.isDirectory()) {
|
|
|
+ System.out.println(String.format("数据目录路径不存在,%s", dataDirPath));
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 判断关键资源文件是否存在
|
|
|
+ List<String> fileNameList = Arrays.asList("长尾词_合并_分词.txt", "长尾词_合并.txt", "长尾词_合并_倒排索引.txt");
|
|
|
+ for (String fileName : fileNameList) {
|
|
|
+ String resFilePath = String.join(File.separator, dataDirPath, fileName);
|
|
|
+ File resfile = new File(resFilePath);
|
|
|
+ if (!resfile.exists() || !resfile.isFile()) {
|
|
|
+ System.out.println(String.format("文件不存在!文件路径:%s", resFilePath));
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 归档历史数据文件
|
|
|
+ File[] files = dataDir.listFiles();
|
|
|
+ List<File> historyAggFile = Arrays.stream(files).filter(file -> {
|
|
|
+ Matcher matcher = aggFilePattern.matcher(file.getName());
|
|
|
+ return matcher.find();
|
|
|
+ }).collect(Collectors.toList());
|
|
|
+ if (Objects.nonNull(historyAggFile) || historyAggFile.size() > 0) {
|
|
|
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
|
|
|
+ String archivePath = String.join(File.separator, dataDirPath, String.format("长尾词_聚合_归档_%s", sdf.format(new Date())));
|
|
|
+ File archiveDir = new File(archivePath);
|
|
|
+ archiveDir.mkdirs();
|
|
|
+ for (File historyFile : historyAggFile) {
|
|
|
+ String destPath = String.join(File.separator, archivePath, historyFile.getName());
|
|
|
+ historyFile.renameTo(new File(destPath));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ int totalWord = 0;
|
|
|
+
|
|
|
+ Pattern pattern = Pattern.compile("([^,]+)");
|
|
|
+
|
|
|
+ // 构造关键词缓存
|
|
|
+ try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_合并_分词.txt"));
|
|
|
+ BufferedReader br = new BufferedReader(reader)) {
|
|
|
+ String line = null;
|
|
|
+ while ((line = br.readLine()) != null) {
|
|
|
+ if (StringUtils.isBlank(line)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 记录总文本数
|
|
|
+ totalWord ++;
|
|
|
+
|
|
|
+ // 提取关键词和分词
|
|
|
+ Matcher matcher = pattern.matcher(line);
|
|
|
+ if (!matcher.find()) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ String key = matcher.group();
|
|
|
+ if (StringUtils.isBlank(key)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ List<String> stems = new ArrayList<>();
|
|
|
+ while (matcher.find()) {
|
|
|
+ String stem = matcher.group();
|
|
|
+ if (StringUtils.isBlank(stem)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ stems.add(stem);
|
|
|
+ }
|
|
|
+ Map<CharSequence, Integer> stemMap = stems.stream().collect(Collectors.toMap(Function.identity(), v -> 1, Integer::sum));
|
|
|
+
|
|
|
+ wordCache.put(totalWord, new Word(key, stemMap));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 构建倒排索引缓存
|
|
|
+ try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_合并_倒排索引.txt"));
|
|
|
+ BufferedReader br = new BufferedReader(reader)) {
|
|
|
+ String line = null;
|
|
|
+ while ((line = br.readLine()) != null) {
|
|
|
+ if (StringUtils.isBlank(line)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 提取关键词和分词
|
|
|
+ Matcher matcher = pattern.matcher(line);
|
|
|
+ if (!matcher.find()) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ String stem = matcher.group();
|
|
|
+ if (StringUtils.isBlank(stem)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ Set<Integer> positions = new CopyOnWriteArraySet<>();
|
|
|
+ while (matcher.find()) {
|
|
|
+ String position = matcher.group();
|
|
|
+ if (StringUtils.isBlank(position)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ positions.add(Integer.valueOf(position));
|
|
|
+ }
|
|
|
+
|
|
|
+ indexCache.put(stem, positions);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 初始化已处理位图
|
|
|
+ bitmap = new BitSet(totalWord+1);
|
|
|
+
|
|
|
+ // 分割计算任务
|
|
|
+ List<CalTask> calTasks = avgSplitTask(totalWord, perTaskNum);
|
|
|
+
|
|
|
+ // 提交任务
|
|
|
+ ExecutorService executorService = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
|
|
|
+// ExecutorService executorService = Executors.newFixedThreadPool(1);
|
|
|
+ for (CalTask calTask : calTasks) {
|
|
|
+ executorService.submit(new CalRunable(calTask.getStartPos(), calTask.getEndPos(), dataDirPath, wordCache, indexCache, bitmap));
|
|
|
+ }
|
|
|
+
|
|
|
+ // 等待任务执行完成
|
|
|
+ executorService.awaitTermination(12, TimeUnit.HOURS);
|
|
|
+
|
|
|
+ // 合并计算结果
|
|
|
+ List<File> aggResultFiles = Arrays.stream(dataDir.listFiles()).filter(file -> aggFilePattern.matcher(file.getName()).find()).collect(Collectors.toList());
|
|
|
+ if (aggResultFiles.size() == 0) {
|
|
|
+ System.out.println("没有找到任何计算分结果,任务结束");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ String aggFilePath = String.join(File.separator, dataDirPath, "长尾词_合并_聚合.txt");
|
|
|
+ try (FileOutputStream fileOutputStream = new FileOutputStream(aggFilePath);
|
|
|
+ BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(fileOutputStream)) {
|
|
|
+ for (File aggResultFile : aggResultFiles) {
|
|
|
+ try (FileInputStream fileInputStream = new FileInputStream(aggResultFile);
|
|
|
+ BufferedInputStream bufferedInputStream = new BufferedInputStream(fileInputStream)) {
|
|
|
+ bufferedOutputStream.write(bufferedInputStream.readAllBytes());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private static List<CalTask> avgSplitTask(int total, int internal) {
|
|
|
+ // 分割的任务份数
|
|
|
+ int taskNum = (int) Math.ceil((double) total / internal);
|
|
|
+ // 平分
|
|
|
+ List<CalTask> calTasks = new ArrayList<>();
|
|
|
+ for (int i = 0; i < taskNum; i++) {
|
|
|
+ int start = i * internal + 1;
|
|
|
+ int end = i * internal + internal;
|
|
|
+ if (end > total) {
|
|
|
+ end = total;
|
|
|
+ }
|
|
|
+ calTasks.add(new CalTask(start, end));
|
|
|
+ }
|
|
|
+ return calTasks;
|
|
|
}
|
|
|
}
|