소스 검색

完善代码结构

ChenYL 2 년 전
부모
커밋
52328d714c

+ 2 - 2
pom.xml

@@ -6,7 +6,7 @@
 
     <groupId>top.zhixinghe1</groupId>
     <artifactId>money-mining</artifactId>
-    <version>1.0-SNAPSHOT</version>
+    <version>1.0</version>
 
     <properties>
         <maven.compiler.source>17</maven.compiler.source>
@@ -70,7 +70,7 @@
                             <addClasspath>true</addClasspath>
                             <classpathPrefix>lib/</classpathPrefix>
                             <!--添加项目中主类-->
-                            <mainClass>top.zhixinghe1.money.AggApplication</mainClass>
+                            <mainClass>top.zhixinghe1.money.Application</mainClass>
                         </manifest>
                     </archive>
                 </configuration>

+ 25 - 0
src/main/java/top/zhixinghe1/money/Application.java

@@ -0,0 +1,25 @@
+package top.zhixinghe1.money;
+
+import top.zhixinghe1.money.agg.Agg;
+
+/** Program entry point: runs the sub-program named by the first command-line argument. */
+public class Application {
+    /**
+     * Validates the arguments and runs the {@code agg} sub-program via {@link Agg#process}.
+     * @param args sub-program name ({@code "agg"}) followed by the data directory path
+     * @throws Exception if the arguments are rejected or {@link Agg#process} fails
+     */
+    public static void main(String[] args) throws Exception {
+        if (args.length == 0) {
+            throw new IllegalArgumentException("运行Java程序前,请先输入运行参数");
+        }
+        String funcName = args[0];
+        if (!"agg".equals(funcName)) {
+            throw new IllegalArgumentException(String.format("输入了非法程序名:%s", funcName));
+        }
+        if (args.length != 2) { // "agg" takes exactly one argument after its name
+            throw new IllegalArgumentException("Java长尾词聚合程序,输入了非法参数");
+        }
+        new Agg().process(args[1]);
+    }
+}

+ 0 - 41
src/main/java/top/zhixinghe1/money/CalResult.java

@@ -1,41 +0,0 @@
-package top.zhixinghe1.money;
-
-import java.util.List;
-
-public class CalResult {
-
-    private boolean endStatus;
-
-    private boolean aggStatus;
-
-    private List<String> similarWords;
-
-    public CalResult(boolean aggStatus, List<String> similarWords) {
-        this.aggStatus = aggStatus;
-        this.similarWords = similarWords;
-    }
-
-    public boolean isAggStatus() {
-        return aggStatus;
-    }
-
-    public void setAggStatus(boolean aggStatus) {
-        this.aggStatus = aggStatus;
-    }
-
-    public List<String> getSimilarWords() {
-        return similarWords;
-    }
-
-    public void setSimilarWords(List<String> similarWords) {
-        this.similarWords = similarWords;
-    }
-
-    public boolean isEndStatus() {
-        return endStatus;
-    }
-
-    public void setEndStatus(boolean endStatus) {
-        this.endStatus = endStatus;
-    }
-}

+ 0 - 114
src/main/java/top/zhixinghe1/money/CalRunable.java

@@ -1,114 +0,0 @@
-package top.zhixinghe1.money;
-
-import me.tongfei.progressbar.ProgressBar;
-import org.apache.commons.text.similarity.CosineSimilarity;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
-import java.util.Set;
-import java.util.concurrent.LinkedBlockingQueue;
-
-public class CalRunable implements Runnable {
-
-    private int start;
-
-    private int end;
-
-    private String dataDirPath;
-
-    private BufferedWriter writer;
-
-    private Map<Integer, Word> wordCache = new HashMap();
-
-    private Map<String, Set<Integer>> indexCache = new HashMap();
-
-    private BitSet bitmap = null;
-
-    private CosineSimilarity cosineSimilarity = new CosineSimilarity();
-
-    private Double aggThreshold = 0.8;
-
-    private LinkedBlockingQueue<CalResult> queue;
-
-    private Set<Integer> indexSet = new HashSet<>();
-    private List<String> result = new ArrayList<>();
-
-    public CalRunable(int start, int end, String dataDirPath, Map<Integer, Word> wordCache, Map<String, Set<Integer>> indexCache, BitSet bitmap, LinkedBlockingQueue<CalResult> queue) {
-        this.start = start;
-        this.end = end;
-        this.dataDirPath = dataDirPath;
-        this.wordCache = wordCache;
-        this.indexCache = indexCache;
-        this.bitmap = bitmap;
-        this.queue = queue;
-    }
-
-    @Override
-    public void run() {
-        try {
-            for (int i = start; i <= end; i++) {
-                CalResult calResult = null;
-                if (cal(i)) {
-                    calResult = new CalResult(true, new ArrayList<>(result));
-                } else {
-                    calResult = new CalResult(false, null);
-                }
-                calResult.setEndStatus(i == end);
-                queue.put(calResult);
-            }
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-    }
-
-    private boolean cal(int i) {
-        // 判断是否已进行计算
-        if (bitmap.get(i)) {
-            return false;
-        }
-
-        // 清除上一轮的数据
-        indexSet.clear();
-        result.clear();
-
-        Word word = wordCache.get(i);
-        if (Objects.isNull(word.getStemMap()) || word.getStemMap().size() == 0) {
-            return false;
-        }
-        bitmap.set(i, true);
-        result.add(word.getKey());
-        for (CharSequence stem : word.getStemMap().keySet()) {
-            Set<Integer> positions = indexCache.get(stem);
-            for (Integer position : positions) {
-                if (bitmap.get(position)) {
-                    positions.remove(position);
-                } else {
-                    indexSet.add(position);
-                }
-            }
-        }
-        for (Integer index : indexSet) {
-            Word candicateWord = wordCache.get(index);
-            if (Objects.isNull(candicateWord.getStemMap())) {
-                continue;
-            }
-            Double v = cosineSimilarity.cosineSimilarity(word.getStemMap(), candicateWord.getStemMap());
-            if (v < aggThreshold) {
-                continue;
-            }
-            result.add(candicateWord.getKey());
-        }
-        // 输出计算结果
-        return result.size() > 1;
-    }
-}

+ 0 - 34
src/main/java/top/zhixinghe1/money/CalTask.java

@@ -1,34 +0,0 @@
-package top.zhixinghe1.money;
-
-import java.io.Serial;
-import java.io.Serializable;
-
-public class CalTask implements Serializable {
-    @Serial
-    private static final long serialVersionUID = 6711062995204035815L;
-
-    private int startPos;
-
-    private int endPos;
-
-    public CalTask(int startPos, int endPos) {
-        this.startPos = startPos;
-        this.endPos = endPos;
-    }
-
-    public int getStartPos() {
-        return startPos;
-    }
-
-    public void setStartPos(int startPos) {
-        this.startPos = startPos;
-    }
-
-    public int getEndPos() {
-        return endPos;
-    }
-
-    public void setEndPos(int endPos) {
-        this.endPos = endPos;
-    }
-}

+ 0 - 117
src/main/java/top/zhixinghe1/money/Test.java

@@ -1,117 +0,0 @@
-package top.zhixinghe1.money;
-
-import me.tongfei.progressbar.ProgressBar;
-
-import java.util.Arrays;
-import java.util.BitSet;
-import java.util.List;
-
-public class Test {
-    public static void main(String[] args) throws InterruptedException {
-//       "QQ邮箱格式怎么写" "QQ邮箱格式如何写", 相似度约0.8
-//        Map<CharSequence, Integer> leftVector = Arrays.asList("QQ", "邮箱", "格式", "怎么", "写").stream().collect(Collectors.toMap(v -> v, v -> 1));
-//        Map<CharSequence, Integer> rightVector = Arrays.asList("QQ", "邮箱", "格式", "如何", "写").stream().collect(Collectors.toMap(v -> v, v -> 1));
-//        CosineSimilarity cosineSimilarity = new CosineSimilarity();
-//        long start = System.currentTimeMillis();
-//        for (int i = 0; i < 300; i++) {
-//            Double v = cosineSimilarity.cosineSimilarity(leftVector, rightVector);
-//        }
-//        System.out.println(System.currentTimeMillis() - start);
-
-
-//        try (ProgressBar pb1 = new ProgressBar("文本聚合计算", 100)) {
-//            for (int i = 0; i < 100; i++) {
-//                Thread.sleep(1000);
-//                pb1.step();
-//            }
-//        }
-
-//        JedisPool pool = new JedisPool("127.0.0.1", 6379);
-//        try (Jedis jedis = pool.getResource()) {
-//            // Store & Retrieve a simple string
-//            jedis.set("foo", "bar");
-//            String foo = jedis.get("foo");
-//            System.out.println(); // prints bar
-//
-//            // Store & Retrieve a HashMap
-//            Map<String, String> hash = new HashMap<>();;
-//            hash.put("name", "John");
-//            hash.put("surname", "Smith");
-//            hash.put("company", "Redis");
-//            hash.put("age", "29");
-//            jedis.hset("user-session:123", hash);
-//            Map<String, String> stringStringMap = jedis.hgetAll("user-session:123");
-//            System.out.println(stringStringMap);
-//            // Prints: {name=John, surname=Smith, company=Redis, age=29}
-//
-//            // 核心:获取分词列表 进行相似度计算
-//            //长尾词获取
-//            String word = jedis.hget("word", "1");
-//            List<String> word1 = jedis.hmget("word", "1", "2");
-//            // 管道使用
-//            try (Pipeline pipelined = jedis.pipelined();) {
-//                // 分词List获取
-////                pipelined.lpush("testList", "QQ");
-////                pipelined.lpush("testList", "邮箱");
-////                pipelined.lpush("testList", "格式");
-////                pipelined.lpush("testList", "怎么");
-////                pipelined.lpush("testList", "写");
-////                pipelined.sync();
-//                List<String> testList = jedis.lrange("testList", 0L, -1L);
-//
-//                // 批量获取、list对象为空
-////                pipelined.lpush("testList2", "QQ");
-////                pipelined.lpush("testList2", "邮箱");
-////                pipelined.lpush("testList2", "格式");
-////                pipelined.lpush("testList2", "如何");
-////                pipelined.lpush("testList2", "写");
-//                pipelined.sync();
-//                pipelined.lrange("testList", 0L, -1L);
-//                pipelined.lrange("testList2", 0L, -1L);
-//                pipelined.lrange("testList3", 0L, -1L);
-//                List<Object> allResult =  pipelined.syncAndReturnAll();
-//
-//                // 倒排索引获取
-//                pipelined.sadd("testIndex1", "1", "2", "3", "4");
-//                pipelined.sadd("testIndex2", "2", "4", "5", "7");
-//                pipelined.sync();
-//                Set<String> sinter = jedis.sunion("testIndex1", "testIndex2");
-//
-//                // 删除set中指定元素
-//                long testIndex2 = jedis.srem("testIndex2", "4", "8");
-
-//                // 批量删除key
-//                Set<String> keys = jedis.keys("word*");
-//                String[] array = keys.toArray(String[]::new);
-//                jedis.del(array);
-//                System.out.println("暂停");
-//            }
-//            TODO bitmap使用
-//            jedis.setbit("testBit", 32, true);
-//            jedis.setbit("testBit", 1, true);
-//            jedis.setbit("testBit", 15, true);
-//            jedis.setbit("testBit", 27, true);
-//            List<Integer> list = Arrays.asList(1, 15, 27, 32);
-//            for (Integer p : list) {
-//                boolean testBit = jedis.getbit("testBit", p);
-//                if (testBit) {
-//                    System.out.println(String.format("redis testBit 设置成功"));
-//                } else {
-//                    System.out.println(String.format("redis testBit 设置失败"));
-//                }
-//            }
-//            BitSet bitSet = BitSet.valueOf(jedis.get("testBit").getBytes());
-//            for (Integer p : list) {
-//                boolean testBit = bitSet.get(p);
-//                if (testBit) {
-//                    System.out.println(String.format("bitset testBit 设置成功"));
-//                } else {
-//                    System.out.println(String.format("bitset testBit 设置失败"));
-//                }
-//            }
-//            System.out.println("暂停");
-//        }
-
-        System.out.println("暂停");
-    }
-}

+ 0 - 40
src/main/java/top/zhixinghe1/money/Word.java

@@ -1,40 +0,0 @@
-package top.zhixinghe1.money;
-
-import java.io.Serial;
-import java.io.Serializable;
-import java.util.List;
-import java.util.Map;
-
-/**
- * @author tyuio
- */
-public class Word implements Serializable {
-
-    @Serial
-    private static final long serialVersionUID = 888376712090774661L;
-
-    private String key;
-
-    private Map<CharSequence, Integer> stemMap;
-
-    public Word(String key, Map<CharSequence, Integer> stemMap) {
-        this.key = key;
-        this.stemMap = stemMap;
-    }
-
-    public String getKey() {
-        return key;
-    }
-
-    public void setKey(String key) {
-        this.key = key;
-    }
-
-    public Map<CharSequence, Integer> getStemMap() {
-        return stemMap;
-    }
-
-    public void setStemMap(Map<CharSequence, Integer> stemMap) {
-        this.stemMap = stemMap;
-    }
-}

+ 17 - 47
src/main/java/top/zhixinghe1/money/AggApplication.java → src/main/java/top/zhixinghe1/money/agg/Agg.java

@@ -1,36 +1,31 @@
-package top.zhixinghe1.money;
+package top.zhixinghe1.money.agg;
 
 import me.tongfei.progressbar.ProgressBar;
 import me.tongfei.progressbar.ProgressBarBuilder;
 import org.apache.commons.lang3.StringUtils;
+import top.zhixinghe1.money.agg.entity.CalResult;
+import top.zhixinghe1.money.agg.entity.CalRunable;
+import top.zhixinghe1.money.agg.entity.CalTask;
+import top.zhixinghe1.money.agg.entity.Word;
 
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
 import java.io.BufferedReader;
 import java.io.BufferedWriter;
 import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
 import java.io.FileReader;
 import java.io.FileWriter;
-import java.io.IOException;
 import java.text.SimpleDateFormat;
 import java.time.temporal.ChronoUnit;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.BitSet;
-import java.util.Date;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
-import java.util.Objects;
 import java.util.Set;
 import java.util.concurrent.CopyOnWriteArraySet;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.TimeUnit;
 import java.util.function.Function;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -39,7 +34,7 @@ import java.util.stream.Collectors;
 /**
  * 文本聚合 程序
  */
-public class AggApplication {
+public class Agg {
 
     private static final int perTaskNum = 10000;
 
@@ -49,49 +44,24 @@ public class AggApplication {
 
     private static BitSet bitmap = null;
 
-    private static final Pattern aggFilePattern = Pattern.compile("长尾词_合并_聚合_\\d+_\\d+.txt");
-
-    public static void main(String[] args) throws IOException, InterruptedException {
-
-//        String dataDirPath = args[0];
-        String dataDirPath = "E:\\ChenYL\\CodeRepository\\money-mining-python\\data\\test";
+    public void process(String dataDirPath) throws Exception {
         if (StringUtils.isBlank(dataDirPath)) {
-            System.out.println("没有输入目标数据路径");
-            return;
+            throw new Exception("没有输入目标数据路径");
         }
 
         // 判断传入路径是否有效
         File dataDir = new File(dataDirPath);
         if (!dataDir.exists() || !dataDir.isDirectory()) {
-            System.out.println(String.format("数据目录路径不存在,%s", dataDirPath));
-            return;
+            throw new Exception(String.format("数据目录路径不存在,%s", dataDirPath));
         }
 
         // 判断关键资源文件是否存在
-        List<String> fileNameList = Arrays.asList("长尾词_合并_分词.txt", "长尾词_合并.txt", "长尾词_合并_倒排索引.txt");
+        List<String> fileNameList = Arrays.asList("长尾词.txt", "长尾词_分词.txt", "长尾词_倒排索引.txt");
         for (String fileName : fileNameList) {
             String resFilePath = String.join(File.separator, dataDirPath, fileName);
             File resfile = new File(resFilePath);
             if (!resfile.exists() || !resfile.isFile()) {
-                System.out.println(String.format("文件不存在!文件路径:%s", resFilePath));
-                return;
-            }
-        }
-
-        // 归档历史数据文件
-        File[] files = dataDir.listFiles();
-        List<File> historyAggFile = Arrays.stream(files).filter(file -> {
-            Matcher matcher = aggFilePattern.matcher(file.getName());
-            return matcher.find();
-        }).collect(Collectors.toList());
-        if (Objects.nonNull(historyAggFile) || historyAggFile.size() > 0) {
-            SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
-            String archivePath = String.join(File.separator, dataDirPath, String.format("长尾词_聚合_归档_%s", sdf.format(new Date())));
-            File archiveDir = new File(archivePath);
-            archiveDir.mkdirs();
-            for (File historyFile : historyAggFile) {
-                String destPath = String.join(File.separator, archivePath, historyFile.getName());
-                historyFile.renameTo(new File(destPath));
+                throw new Exception(String.format("文件不存在!文件路径:%s", resFilePath));
             }
         }
 
@@ -100,7 +70,7 @@ public class AggApplication {
         Pattern pattern = Pattern.compile("([^,]+)");
 
         // 构造关键词缓存
-        try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_合并_分词.txt"));
+        try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_分词.txt"));
              BufferedReader br = new BufferedReader(reader)) {
             String line = null;
             while ((line = br.readLine()) != null) {
@@ -136,7 +106,7 @@ public class AggApplication {
         }
 
         // 构建倒排索引缓存
-        try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_合并_倒排索引.txt"));
+        try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_倒排索引.txt"));
              BufferedReader br = new BufferedReader(reader)) {
             String line = null;
             while ((line = br.readLine()) != null) {
@@ -178,11 +148,11 @@ public class AggApplication {
         // 提交任务
         ExecutorService executorService = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
         for (CalTask calTask : calTasks) {
-            executorService.submit(new CalRunable(calTask.getStartPos(), calTask.getEndPos(), dataDirPath, wordCache, indexCache, bitmap, queue));
+            executorService.submit(new CalRunable(calTask.getStartPos(), calTask.getEndPos(), wordCache, indexCache, bitmap, queue));
         }
 
         SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
-        String aggFilePath = String.join(File.separator, dataDirPath, String.format("长尾词_合并_聚合_%s.txt", sdf.format(new Date())));
+        String aggFilePath = String.join(File.separator, dataDirPath, "长尾词_聚合结果_临时.txt");
         ProgressBarBuilder progressBarBuilder = new ProgressBarBuilder().setTaskName("文本聚合计算")
                 .setInitialMax(totalWord)
                 .setUnit("个", 1)
@@ -210,12 +180,12 @@ public class AggApplication {
                 pb.step();
 
                 if (taskNum == currentTaskProgress) {
-                   break;
+                    break;
                 }
             }
         }
 
-        executorService.awaitTermination(1, TimeUnit.MINUTES);
+        executorService.shutdown();
         System.out.println("聚合任务执行完成");
     }