Эх сурвалжийг харах

feat:优化候选词获取方式;优化进度显示

ChenYL 1 жил өмнө
parent
commit
d06baa665f

+ 6 - 1
pom.xml

@@ -39,7 +39,12 @@
             <artifactId>caffeine</artifactId>
             <version>3.1.8</version>
         </dependency>
-
+        <!-- https://mvnrepository.com/artifact/cn.hutool/hutool-core -->
+        <dependency>
+            <groupId>cn.hutool</groupId>
+            <artifactId>hutool-core</artifactId>
+            <version>5.8.25</version>
+        </dependency>
     </dependencies>
 
     <!-- 配置阿里云仓库 -->

+ 1 - 1
src/main/java/top/zhixinghe1/money/agg/Agg.java

@@ -77,7 +77,7 @@ public class Agg {
                     currentTaskProgress ++ ;
                 }
                 // 更新发呆进度
-                pb.step();
+                pb.stepBy(take.getAggNum());
 
                 if (taskNum == currentTaskProgress) {
                     break;

+ 17 - 11
src/main/java/top/zhixinghe1/money/agg/CalRunnable.java

@@ -1,5 +1,6 @@
 package top.zhixinghe1.money.agg;
 
+import cn.hutool.core.math.Combination;
 import org.apache.commons.text.similarity.CosineSimilarity;
 import org.roaringbitmap.RoaringBitmap;
 import top.zhixinghe1.money.agg.entity.CalInfo;
@@ -11,6 +12,7 @@ import top.zhixinghe1.money.agg.entity.Word;
 import java.math.BigDecimal;
 import java.math.RoundingMode;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.Objects;
 import java.util.concurrent.LinkedBlockingQueue;
@@ -55,7 +57,8 @@ public class CalRunnable implements Runnable {
                 // 计算聚合结果
                 List<String> result = cal(pos);
                 // 返回计算结果
-                CalResult calResult = Objects.nonNull(result) ? new CalResult(true, result) : new CalResult(false, null);
+                CalResult calResult = result.size() >= calInfo.getAggResultThreshold() ? new CalResult(true, result) : new CalResult(false, null);
+                calResult.setAggNum(result.size() == 0 ? 1 : result.size());
                 calResult.setEndStatus(pos == end);
                 queue.put(calResult);
             }
@@ -71,22 +74,26 @@ public class CalRunnable implements Runnable {
      */
     private List<String> cal(int pos) {
         // 判断是否已进行计算
-        if (!calResource.checkAndSetCalStatus(pos)) {
-            return null;
+        if (calResource.checkAndSetCalStatus(pos)) {
+            return Collections.EMPTY_LIST;
         }
 
         // 获取主词
         Word word = calResource.getWord(pos);
         if (Objects.isNull(word.getStemMap()) || word.getStemMap().size() == 0) {
-            return null;
+            return Collections.EMPTY_LIST;
         }
 
-        // 计算候选词位图
-        RoaringBitmap finalBitmap = new RoaringBitmap();
-        for (CharSequence stem : word.getStemMap().keySet()) {
-            RoaringBitmap stemBitmap = calResource.getWordBitmap((String) stem);
-            finalBitmap.or(stemBitmap);
+        // 计算候选词位图,组合下具有相关的关键词才进行计算
+        Combination combination = new Combination(word.getStemMap().keySet().toArray(String[]::new));
+        List<String[]> select = combination.select(2);
+        List<RoaringBitmap> andBitmapList = new ArrayList<>(select.size());
+        for (String[] strings : select) {
+            RoaringBitmap firstBitmap = calResource.getWordBitmap(strings[0]);
+            RoaringBitmap secondBitmap = calResource.getWordBitmap(strings[1]);
+            andBitmapList.add(RoaringBitmap.and(firstBitmap, secondBitmap));
         }
+        RoaringBitmap finalBitmap = RoaringBitmap.or(andBitmapList.iterator());
         finalBitmap.andNot(calResource.getUsedBitmap());
 
         // 设置主词
@@ -109,7 +116,6 @@ public class CalRunnable implements Runnable {
             result.add(candidateWord.getKey());
         }
 
-        // 输出计算结果
-        return result.size() >= calInfo.getAggResultThreshold() ? result : null;
+        return result;
     }
 }

+ 0 - 9
src/main/java/top/zhixinghe1/money/agg/entity/CalInfo.java

@@ -41,11 +41,6 @@ public class CalInfo {
      */
     private Integer reverseIndexTotalNum;
 
-    /**
-     * 缓存最大数量
-     */
-    public Integer cacheMaximumSize = 50 * 10000;
-
     /**
      * 聚合阈值
      */
@@ -101,10 +96,6 @@ public class CalInfo {
         return dataDirPath;
     }
 
-    public Integer getCacheMaximumSize() {
-        return cacheMaximumSize;
-    }
-
     public BigDecimal getAggThreshold() {
         return aggThreshold;
     }

+ 21 - 9
src/main/java/top/zhixinghe1/money/agg/entity/CalResource.java

@@ -65,6 +65,11 @@ public class CalResource {
      */
     private ReentrantReadWriteLock.WriteLock writeLock = reentrantReadWriteLock.writeLock();
 
+    /**
+     * 读锁
+     */
+    private ReentrantReadWriteLock.ReadLock readLock = reentrantReadWriteLock.readLock();
+
     /**
      * 长尾词+分词 随机读写
      */
@@ -85,15 +90,25 @@ public class CalResource {
     /**
      * 检测长尾词计算状态
      * @param pos
-     * @return
+     * @return true-已计算 false-未计算
      */
     public boolean checkAndSetCalStatus(int pos) {
+        // Acquire the lock before entering try so that finally never unlocks a lock that was not taken.
+        writeLock.lock();
         try {
-            writeLock.lock();
-            return usedBitmap.checkedAdd(pos);
+            // The test and the set must happen under one exclusive lock: checking under the read lock
+            // and adding later under the write lock lets two threads both see "not calculated" for
+            // the same pos and both go on to compute it.
+            // checkedAdd returns true only when pos was newly added, so negate it to report
+            // "already calculated" (true) versus "not yet calculated" (false).
+            return !usedBitmap.checkedAdd(pos);
         } finally {
             writeLock.unlock();
         }
     }
 
     /**
@@ -135,11 +150,11 @@ public class CalResource {
     private void loadIndex(CalInfo calInfo) throws IOException {
         // 创建关键词位置缓存
         wordIndexArr = buildWordIndexArr(calInfo.getWordTotalNum(), calInfo.getWordStemFilePath());
-        wordCache = Caffeine.newBuilder().maximumSize(calInfo.cacheMaximumSize).build();
+        wordCache = Caffeine.newBuilder().maximumSize(300 * 10000).build();
 
         // 创建倒排索引缓存
         reverseIndexMap = buildReverseIndexMap(calInfo.getReverseIndexTotalNum(), calInfo.getReverseIndexFilePath());
-        reverseIndexCache = Caffeine.newBuilder().maximumSize(calInfo.cacheMaximumSize).build();
+        reverseIndexCache = Caffeine.newBuilder().maximumSize(100 * 10000).build();
     }
 
     /**
@@ -223,9 +238,6 @@ public class CalResource {
             }
             bufferedRandomAccessFile.seek(reverseIndexMap.get(s));
             String content = bufferedRandomAccessFile.readLine();
-            if (content == null) {
-                System.out.println("暂停");
-            }
             String lineContent = new String(content.getBytes("8859_1"), "UTF-8");
 
             String[] split = lineContent.substring(lineContent.indexOf(",") + 1).split(",");

+ 13 - 0
src/main/java/top/zhixinghe1/money/agg/entity/CalResult.java

@@ -17,6 +17,11 @@ public class CalResult {
      */
     private boolean aggStatus;
 
+    /**
+     * 聚合数量
+     */
+    private Integer aggNum;
+
     /**
      * 聚合结果
      */
@@ -50,4 +55,12 @@ public class CalResult {
     public void setEndStatus(boolean endStatus) {
         this.endStatus = endStatus;
     }
+
+    public Integer getAggNum() {
+        return aggNum;
+    }
+
+    public void setAggNum(Integer aggNum) {
+        this.aggNum = aggNum;
+    }
 }