hai 2 anos · f88842e2f3
--- a/pom.xml
+++ b/pom.xml
@@ -27,6 +27,19 @@
 
				             <artifactId>progressbar</artifactId>
			
 
				             <version>0.10.0</version>
			
 
				         </dependency>
			
 
				+        <!-- 稀疏位图 -->
			
 
				+        <dependency>
			
 
				+            <groupId>org.roaringbitmap</groupId>
			
 
				+            <artifactId>RoaringBitmap</artifactId>
			
 
				+            <version>1.0.1</version>
			
 
				+        </dependency>
			
 
				+        <!--  缓存框架  -->
			
 
				+        <dependency>
			
 
				+            <groupId>com.github.ben-manes.caffeine</groupId>
			
 
				+            <artifactId>caffeine</artifactId>
			
 
				+            <version>3.1.8</version>
			
 
				+        </dependency>
			
 
				+
			
 
				     </dependencies>
			
 
				 
			
 
				     <!-- 配置阿里云仓库 -->
			
--- a/src/main/java/top/zhixinghe1/money/Application.java
+++ b/src/main/java/top/zhixinghe1/money/Application.java
@@ -16,7 +16,7 @@ public class Application {
 
				             throw new Exception(String.format("输入了非法程序名：%s", funcName));
			
 
				         }
			
 
				         if ("agg".equals(funcName)) {
			
 
				-            if (args.length != 2) {
			
 
				+            if (args.length < 2) {
			
 
				                 throw new Exception("Java长尾词聚合程序，输入了非法参数");
			
 
				             }
			
 
				             new Agg().process(args[1]);
			
--- a/src/main/java/top/zhixinghe1/money/agg/Agg.java
+++ b/src/main/java/top/zhixinghe1/money/agg/Agg.java
@@ -1,35 +1,33 @@
 
				 package top.zhixinghe1.money.agg;
			
 
				 
			
 
				+import com.github.benmanes.caffeine.cache.Caffeine;
			
 
				+import com.github.benmanes.caffeine.cache.LoadingCache;
			
 
				 import me.tongfei.progressbar.ProgressBar;
			
 
				 import me.tongfei.progressbar.ProgressBarBuilder;
			
 
				 import org.apache.commons.lang3.StringUtils;
			
 
				+import org.roaringbitmap.RoaringBitmap;
			
 
				+import top.zhixinghe1.money.agg.cache.ReverseIndexLoader;
			
 
				+import top.zhixinghe1.money.agg.cache.WordLoader;
			
 
				+import top.zhixinghe1.money.agg.entity.BufferedRandomAccessFile;
			
 
				 import top.zhixinghe1.money.agg.entity.CalResult;
			
 
				-import top.zhixinghe1.money.agg.entity.CalRunable;
			
 
				 import top.zhixinghe1.money.agg.entity.CalTask;
			
 
				 import top.zhixinghe1.money.agg.entity.Word;
			
 
				 
			
 
				 import java.io.BufferedReader;
			
 
				 import java.io.BufferedWriter;
			
 
				 import java.io.File;
			
 
				-import java.io.FileReader;
			
 
				 import java.io.FileWriter;
			
 
				-import java.text.SimpleDateFormat;
			
 
				+import java.io.IOException;
			
 
				+import java.io.RandomAccessFile;
			
 
				 import java.time.temporal.ChronoUnit;
			
 
				 import java.util.ArrayList;
			
 
				 import java.util.Arrays;
			
 
				-import java.util.BitSet;
			
 
				-import java.util.HashMap;
			
 
				 import java.util.List;
			
 
				 import java.util.Map;
			
 
				-import java.util.Set;
			
 
				-import java.util.concurrent.CopyOnWriteArraySet;
			
 
				+import java.util.concurrent.ConcurrentHashMap;
			
 
				 import java.util.concurrent.ExecutorService;
			
 
				 import java.util.concurrent.Executors;
			
 
				 import java.util.concurrent.LinkedBlockingQueue;
			
 
				-import java.util.function.Function;
			
 
				-import java.util.regex.Matcher;
			
 
				-import java.util.regex.Pattern;
			
 
				-import java.util.stream.Collectors;
			
 
				 
			
 
				 /**
			
 
				  * 文本聚合 程序
			
@@ -38,11 +36,7 @@ public class Agg {
 
				 
			
 
				     private static final int perTaskNum = 10000;
			
 
				 
			
 
				-    private static final Map<Integer, Word> wordCache = new HashMap();
			
 
				-
			
 
				-    private static final Map<String, Set<Integer>> indexCache = new HashMap();
			
 
				-
			
 
				-    private static BitSet bitmap = null;
			
 
				+    private static RoaringBitmap bitmap = null;
			
 
				 
			
 
				     public void process(String dataDirPath) throws Exception {
			
 
				         if (StringUtils.isBlank(dataDirPath)) {
			
@@ -56,7 +50,7 @@ public class Agg {
 
				         }
			
 
				 
			
 
				         // 判断关键资源文件是否存在
			
 
				-        List<String> fileNameList = Arrays.asList("长尾词.txt", "长尾词_分词.txt", "长尾词_倒排索引.txt");
			
 
				+        List<String> fileNameList = Arrays.asList(Constant.WORD_STEM_FILE, Constant.WORD_REVERSE_INDEX_FILE);
			
 
				         for (String fileName : fileNameList) {
			
 
				             String resFilePath = String.join(File.separator, dataDirPath, fileName);
			
 
				             File resfile = new File(resFilePath);
			
@@ -65,96 +59,41 @@ public class Agg {
 
				             }
			
 
				         }
			
 
				 
			
 
				-        int totalWord = 0;
			
 
				-
			
 
				-        Pattern pattern = Pattern.compile("([^,]+)");
			
 
				-
			
 
				-        // 构造关键词缓存
			
 
				-        try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_分词.txt"));
			
 
				-             BufferedReader br = new BufferedReader(reader)) {
			
 
				-            String line = null;
			
 
				-            while ((line = br.readLine()) != null) {
			
 
				-                if (StringUtils.isBlank(line)) {
			
 
				-                    continue;
			
 
				-                }
			
 
				-
			
 
				-                // 记录总文本数
			
 
				-                totalWord ++;
			
 
				-
			
 
				-                // 提取关键词和分词
			
 
				-                Matcher matcher = pattern.matcher(line);
			
 
				-                if (!matcher.find()) {
			
 
				-                    continue;
			
 
				-                }
			
 
				-                String key = matcher.group();
			
 
				-                if (StringUtils.isBlank(key)) {
			
 
				-                    continue;
			
 
				-                }
			
 
				-
			
 
				-                List<String> stems = new ArrayList<>();
			
 
				-                while (matcher.find()) {
			
 
				-                    String stem = matcher.group();
			
 
				-                    if (StringUtils.isBlank(stem)) {
			
 
				-                        continue;
			
 
				-                    }
			
 
				-                    stems.add(stem);
			
 
				-                }
			
 
				-                Map<CharSequence, Integer> stemMap = stems.stream().collect(Collectors.toMap(Function.identity(), v -> 1, Integer::sum));
			
 
				-
			
 
				-                wordCache.put(totalWord, new Word(key, stemMap));
			
 
				-            }
			
 
				-        }
			
 
				-
			
 
				-        // 构建倒排索引缓存
			
 
				-        try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_倒排索引.txt"));
			
 
				-             BufferedReader br = new BufferedReader(reader)) {
			
 
				-            String line = null;
			
 
				-            while ((line = br.readLine()) != null) {
			
 
				-                if (StringUtils.isBlank(line)) {
			
 
				-                    continue;
			
 
				-                }
			
 
				+        // 读取统计信息
			
 
				+        int wordTotalNum = 499995;
			
 
				+        int reverseIndexTotalNum = 88090;
			
 
				 
			
 
				-                // 提取关键词和分词
			
 
				-                Matcher matcher = pattern.matcher(line);
			
 
				-                if (!matcher.find()) {
			
 
				-                    continue;
			
 
				-                }
			
 
				-                String stem = matcher.group();
			
 
				-                if (StringUtils.isBlank(stem)) {
			
 
				-                    continue;
			
 
				-                }
			
 
				+        // 创建关键词位置缓存
			
 
				+        String wordFilePath = String.join(File.separator, dataDirPath, Constant.WORD_STEM_FILE);
			
 
				+        long[] lineContentIndex = getLineContentIndex(wordTotalNum, wordFilePath);
			
 
				+        LoadingCache<Integer, Word> wordCache = Caffeine.newBuilder()
			
 
				+                .maximumSize(1000000)
			
 
				+                .build(new WordLoader(dataDirPath, lineContentIndex));
			
 
				 
			
 
				-                Set<Integer> positions = new CopyOnWriteArraySet<>();
			
 
				-                while (matcher.find()) {
			
 
				-                    String position = matcher.group();
			
 
				-                    if (StringUtils.isBlank(position)) {
			
 
				-                        continue;
			
 
				-                    }
			
 
				-                    positions.add(Integer.valueOf(position));
			
 
				-                }
			
 
				-
			
 
				-                indexCache.put(stem, positions);
			
 
				-            }
			
 
				-        }
			
 
				+        // 创建倒排索引缓存
			
 
				+        String stemFilePath = String.join(File.separator, dataDirPath, Constant.WORD_REVERSE_INDEX_FILE);
			
 
				+        Map<String, long[]> lienContentIndexMap = getLienContentIndexMap(reverseIndexTotalNum, stemFilePath);
			
 
				+        LoadingCache<String, RoaringBitmap> indexCache = Caffeine.newBuilder()
			
 
				+                .maximumSize(1000000)
			
 
				+                .build(new ReverseIndexLoader(dataDirPath, lienContentIndexMap));
			
 
				 
			
 
				         // 初始化已处理位图
			
 
				-        bitmap = new BitSet(totalWord+1);
			
 
				+        bitmap = new RoaringBitmap();
			
 
				 
			
 
				         // 分割计算任务
			
 
				-        List<CalTask> calTasks = avgSplitTask(totalWord, perTaskNum);
			
 
				+        List<CalTask> calTasks = avgSplitTask(wordTotalNum, perTaskNum);
			
 
				 
			
 
				         LinkedBlockingQueue<CalResult> queue = new LinkedBlockingQueue();
			
 
				 
			
 
				         // 提交任务
			
 
				         ExecutorService executorService = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
			
 
				         for (CalTask calTask : calTasks) {
			
 
				-            executorService.submit(new CalRunable(calTask.getStartPos(), calTask.getEndPos(), wordCache, indexCache, bitmap, queue));
			
 
				+            executorService.submit(new CalRunnable(calTask.getStartPos(), calTask.getEndPos(), wordCache, indexCache, bitmap, queue));
			
 
				         }
			
 
				 
			
 
				-        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
			
 
				-        String aggFilePath = String.join(File.separator, dataDirPath, "长尾词_聚合结果_临时.txt");
			
 
				+        String aggFilePath = String.join(File.separator, dataDirPath, Constant.WORD_AGG_RESULT_TEMP_FILE);
			
 
				         ProgressBarBuilder progressBarBuilder = new ProgressBarBuilder().setTaskName("文本聚合计算")
			
 
				-                .setInitialMax(totalWord)
			
 
				+                .setInitialMax(wordTotalNum)
			
 
				                 .setUnit("个", 1)
			
 
				                 .setSpeedUnit(ChronoUnit.SECONDS)
			
 
				                 .showSpeed();
			
@@ -204,4 +143,32 @@ public class Agg {
 
				         }
			
 
				         return calTasks;
			
 
				     }
			
 
				+
			
 
				+    private long[] getLineContentIndex(int wordTotalNum, String filePath) throws IOException {
			
 
				+        BufferedRandomAccessFile randomAccessFile = new BufferedRandomAccessFile(filePath, "r");
			
 
				+        long[] lineContentIndex = new long[wordTotalNum+1];
			
 
				+        for (int i = 1; i <= wordTotalNum; i++, randomAccessFile.readLine()) {
			
 
				+            lineContentIndex[i] = randomAccessFile.getFilePointer();
			
 
				+        }
			
 
				+        return lineContentIndex;
			
 
				+    }
			
 
				+
			
 
				+    private ConcurrentHashMap<String, long[]> getLienContentIndexMap(int stemTotalNum, String filePath) throws IOException {
			
 
				+//        BufferedRandomAccessFile randomAccessFile = new BufferedRandomAccessFile(filePath, "r");
			
 
				+        // TODO 这里太慢了
			
 
				+        RandomAccessFile randomAccessFile = new RandomAccessFile(filePath, "r");
			
 
				+        ConcurrentHashMap<String, long[]> lineContentIndexMap = new ConcurrentHashMap<>(stemTotalNum);
			
 
				+        String line = null;
			
 
				+        int cnt = 0;
			
 
				+        while ((line = randomAccessFile.readLine()) != null) {
			
 
				+            line = new String(line.getBytes("8859_1"), "UTF-8");
			
 
				+            String stem = line.substring(0, line.indexOf(","));
			
 
				+            long[] basicInfo = new long[2];
			
 
				+            basicInfo[0] = randomAccessFile.getFilePointer();
			
 
				+            basicInfo[1] = line.getBytes().length;
			
 
				+            lineContentIndexMap.put(stem, basicInfo);
			
 
				+            cnt++;
			
 
				+        }
			
 
				+        return lineContentIndexMap;
			
 
				+    }
			
 
				 }
			
--- a/src/main/java/top/zhixinghe1/money/agg/entity/CalRunable.java
+++ b/src/main/java/top/zhixinghe1/money/agg/entity/CalRunable.java
@@ -1,13 +1,14 @@
 
				-package top.zhixinghe1.money.agg.entity;
			
 
				+package top.zhixinghe1.money.agg;
			
 
				 
			
 
				+import com.github.benmanes.caffeine.cache.LoadingCache;
			
 
				 import org.apache.commons.text.similarity.CosineSimilarity;
			
 
				+import org.roaringbitmap.RoaringBitmap;
			
 
				+import top.zhixinghe1.money.agg.entity.CalResult;
			
 
				+import top.zhixinghe1.money.agg.entity.Word;
			
 
				 
			
 
				 import java.util.ArrayList;
			
 
				-import java.util.BitSet;
			
 
				-import java.util.HashMap;
			
 
				 import java.util.HashSet;
			
 
				 import java.util.List;
			
 
				-import java.util.Map;
			
 
				 import java.util.Objects;
			
 
				 import java.util.Set;
			
 
				 import java.util.concurrent.LinkedBlockingQueue;
			
@@ -15,17 +16,17 @@ import java.util.concurrent.LinkedBlockingQueue;
 
				 /**
			
 
				  * 计算任务对象
			
 
				  */
			
 
				-public class CalRunable implements Runnable {
			
 
				+public class CalRunnable implements Runnable {
			
 
				 
			
 
				     private int start;
			
 
				 
			
 
				     private int end;
			
 
				 
			
 
				-    private Map<Integer, Word> wordCache = new HashMap();
			
 
				+    private LoadingCache<Integer, Word> wordCache;
			
 
				 
			
 
				-    private Map<String, Set<Integer>> indexCache = new HashMap();
			
 
				+    private LoadingCache<String, RoaringBitmap> indexCache;
			
 
				 
			
 
				-    private BitSet bitmap = null;
			
 
				+    private RoaringBitmap bitmap;
			
 
				 
			
 
				     private CosineSimilarity cosineSimilarity = new CosineSimilarity();
			
 
				 
			
@@ -33,10 +34,9 @@ public class CalRunable implements Runnable {
 
				 
			
 
				     private LinkedBlockingQueue<CalResult> queue;
			
 
				 
			
 
				-    private Set<Integer> indexSet = new HashSet<>();
			
 
				     private List<String> result = new ArrayList<>();
			
 
				 
			
 
				-    public CalRunable(int start, int end, Map<Integer, Word> wordCache, Map<String, Set<Integer>> indexCache, BitSet bitmap, LinkedBlockingQueue<CalResult> queue) {
			
 
				+    public CalRunnable(int start, int end, LoadingCache<Integer, Word> wordCache, LoadingCache<String, RoaringBitmap> indexCache, RoaringBitmap bitmap, LinkedBlockingQueue<CalResult> queue) {
			
 
				         this.start = start;
			
 
				         this.end = end;
			
 
				         this.wordCache = wordCache;
			
@@ -49,7 +49,7 @@ public class CalRunable implements Runnable {
 
				     public void run() {
			
 
				         try {
			
 
				             for (int i = start; i <= end; i++) {
			
 
				-                CalResult calResult = null;
			
 
				+                CalResult calResult;
			
 
				                 if (cal(i)) {
			
 
				                     calResult = new CalResult(true, new ArrayList<>(result));
			
 
				                 } else {
			
@@ -65,31 +65,28 @@ public class CalRunable implements Runnable {
 
				 
			
 
				     private boolean cal(int i) {
			
 
				         // 判断是否已进行计算
			
 
				-        if (bitmap.get(i)) {
			
 
				+        if (bitmap.contains(i)) {
			
 
				             return false;
			
 
				         }
			
 
				 
			
 
				         // 清除上一轮的数据
			
 
				-        indexSet.clear();
			
 
				         result.clear();
			
 
				 
			
 
				         Word word = wordCache.get(i);
			
 
				         if (Objects.isNull(word.getStemMap()) || word.getStemMap().size() == 0) {
			
 
				             return false;
			
 
				         }
			
 
				-        bitmap.set(i, true);
			
 
				+        bitmap.add(i);
			
 
				         result.add(word.getKey());
			
 
				+
			
 
				+        RoaringBitmap finalBitmap = new RoaringBitmap();
			
 
				         for (CharSequence stem : word.getStemMap().keySet()) {
			
 
				-            Set<Integer> positions = indexCache.get(stem);
			
 
				-            for (Integer position : positions) {
			
 
				-                if (bitmap.get(position)) {
			
 
				-                    positions.remove(position);
			
 
				-                } else {
			
 
				-                    indexSet.add(position);
			
 
				-                }
			
 
				-            }
			
 
				+            RoaringBitmap stemBitmap = indexCache.get((String) stem);
			
 
				+            finalBitmap.or(stemBitmap);
			
 
				         }
			
 
				-        for (Integer index : indexSet) {
			
 
				+        finalBitmap.andNot(bitmap);
			
 
				+
			
 
				+        for (Integer index : finalBitmap) {
			
 
				             Word candicateWord = wordCache.get(index);
			
 
				             if (Objects.isNull(candicateWord.getStemMap())) {
			
 
				                 continue;
			
@@ -98,6 +95,7 @@ public class CalRunable implements Runnable {
 
				             if (v < aggThreshold) {
			
 
				                 continue;
			
 
				             }
			
 
				+            bitmap.add(index);
			
 
				             result.add(candicateWord.getKey());
			
 
				         }
			
 
				         // 输出计算结果
			
--- a/src/main/java/top/zhixinghe1/money/agg/Constant.java
+++ b/src/main/java/top/zhixinghe1/money/agg/Constant.java
@@ -0,0 +1,22 @@
 
				+package top.zhixinghe1.money.agg;
			
 
				+
			
 
				+/**
			
 
				+ * 常量
			
 
				+ */
			
 
				+public class Constant {
			
 
				+
			
 
				+    /**
			
 
				+     * 长尾词分词文件
			
 
				+     */
			
 
				+    public static final String WORD_STEM_FILE = "长尾词_分词.txt";
			
 
				+
			
 
				+    /**
			
 
				+     * 长尾词倒排索引文件
			
 
				+     */
			
 
				+    public static final String WORD_REVERSE_INDEX_FILE = "长尾词_倒排索引.txt";
			
 
				+
			
 
				+    /**
			
 
				+     * 长尾词聚合结果临时文件
			
 
				+     */
			
 
				+    public static final String WORD_AGG_RESULT_TEMP_FILE = "长尾词_聚合结果_临时.txt";
			
 
				+}
			
--- a/src/main/java/top/zhixinghe1/money/agg/Test.java
+++ b/src/main/java/top/zhixinghe1/money/agg/Test.java
@@ -0,0 +1,18 @@
 
				+package top.zhixinghe1.money.agg;
			
 
				+
			
 
				+import org.roaringbitmap.ImmutableBitmapDataProvider;
			
 
				+import org.roaringbitmap.IntConsumer;
			
 
				+import org.roaringbitmap.RoaringBitmap;
			
 
				+import org.roaringbitmap.buffer.MutableRoaringBitmap;
			
 
				+import org.roaringbitmap.longlong.IntegerUtil;
			
 
				+
			
 
				+import java.util.HashMap;
			
 
				+
			
 
				+public class Test {
			
 
				+    public static void main(String[] args) {
			
 
				+        RoaringBitmap integers = RoaringBitmap.bitmapOf(1, 3, 7, 10);
			
 
				+        for (Integer integer : integers) {
			
 
				+            System.out.println(integer);
			
 
				+        }
			
 
				+    }
			
 
				+}
			
--- a/src/main/java/top/zhixinghe1/money/agg/cache/ReverseIndexLoader.java
+++ b/src/main/java/top/zhixinghe1/money/agg/cache/ReverseIndexLoader.java
@@ -0,0 +1,68 @@
 
				+package top.zhixinghe1.money.agg.cache;
			
 
				+
			
 
				+import com.github.benmanes.caffeine.cache.CacheLoader;
			
 
				+import org.checkerframework.checker.nullness.qual.Nullable;
			
 
				+import org.roaringbitmap.RoaringBitmap;
			
 
				+import top.zhixinghe1.money.agg.Constant;
			
 
				+import top.zhixinghe1.money.agg.entity.BufferedRandomAccessFile;
			
 
				+
			
 
				+import java.io.File;
			
 
				+import java.io.IOException;
			
 
				+import java.io.RandomAccessFile;
			
 
				+import java.nio.MappedByteBuffer;
			
 
				+import java.nio.channels.FileChannel;
			
 
				+import java.util.Map;
			
 
				+
			
 
				+public class ReverseIndexLoader implements CacheLoader<String, RoaringBitmap> {
			
 
				+
			
 
				+    /**
			
 
				+     * 数据目录路径
			
 
				+     */
			
 
				+    private String dataPath;
			
 
				+
			
 
				+    /**
			
 
				+     * 行索引
			
 
				+     */
			
 
				+    private Map<String, long[]> lineIndex;
			
 
				+
			
 
				+    /**
			
 
				+     * 文件读写器
			
 
				+     */
			
 
				+//    private BufferedRandomAccessFile randomAccessFile;
			
 
				+    private RandomAccessFile randomAccessFile;
			
 
				+
			
 
				+    private MappedByteBuffer mappedByteBuffer;
			
 
				+
			
 
				+    public ReverseIndexLoader(String dataPath, Map<String, long[]> lineIndex) throws IOException {
			
 
				+        this.dataPath = dataPath;
			
 
				+        this.lineIndex = lineIndex;
			
 
				+        String filePath = String.join(File.separator, dataPath, Constant.WORD_REVERSE_INDEX_FILE);
			
 
				+//        randomAccessFile = new BufferedRandomAccessFile(filePath, "r", 10*1024*1024);
			
 
				+        randomAccessFile = new RandomAccessFile(filePath, "r");
			
 
				+        MappedByteBuffer mappedByteBuffer = randomAccessFile.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, Integer.MAX_VALUE);
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    public @Nullable RoaringBitmap load(String s) throws Exception {
			
 
				+        long[] basicInfo = lineIndex.get(s);
			
 
				+//        randomAccessFile.seek();
			
 
				+//        String lineContent = new String(randomAccessFile.readLine().getBytes("8859_1"), "UTF-8");
			
 
				+//        String lineContent = randomAccessFile.readUTF();
			
 
				+//        mappedByteBuffer.position((int) basicInfo[0]);
			
 
				+        byte[] buff = new byte[(int) basicInfo[2]];
			
 
				+        mappedByteBuffer.put(buff, (int)basicInfo[0], (int)basicInfo[2]);
			
 
				+        String lineContent = new String(buff);
			
 
				+
			
 
				+        String[] split = lineContent.substring(lineContent.indexOf(",") + 1).split(",");
			
 
				+        int[] positionIntegers = new int[split.length];
			
 
				+        for (int i = 0; i < split.length; i++) {
			
 
				+            try {
			
 
				+                positionIntegers[i] = Integer.parseInt(split[i]);
			
 
				+            } catch (Exception e) {
			
 
				+                System.out.println("暂停");
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        return RoaringBitmap.bitmapOf(positionIntegers);
			
 
				+    }
			
 
				+}
			
--- a/src/main/java/top/zhixinghe1/money/agg/cache/WordLoader.java
+++ b/src/main/java/top/zhixinghe1/money/agg/cache/WordLoader.java
@@ -0,0 +1,72 @@
 
				+package top.zhixinghe1.money.agg.cache;
			
 
				+
			
 
				+import com.github.benmanes.caffeine.cache.CacheLoader;
			
 
				+import org.checkerframework.checker.nullness.qual.Nullable;
			
 
				+import top.zhixinghe1.money.agg.Constant;
			
 
				+import top.zhixinghe1.money.agg.entity.BufferedRandomAccessFile;
			
 
				+import top.zhixinghe1.money.agg.entity.Word;
			
 
				+
			
 
				+import java.io.File;
			
 
				+import java.io.IOException;
			
 
				+import java.util.Arrays;
			
 
				+import java.util.Collections;
			
 
				+import java.util.Map;
			
 
				+import java.util.function.Function;
			
 
				+import java.util.stream.Collectors;
			
 
				+
			
 
				+public class WordLoader implements CacheLoader<Integer, Word> {
			
 
				+
			
 
				+    /**
			
 
				+     * 数据目录路径
			
 
				+     */
			
 
				+    private String dataPath;
			
 
				+
			
 
				+    /**
			
 
				+     * 行索引
			
 
				+     */
			
 
				+    private long[] lineIndex;
			
 
				+
			
 
				+    /**
			
 
				+     * 文件读写器
			
 
				+     */
			
 
				+    private BufferedRandomAccessFile randomAccessFile;
			
 
				+
			
 
				+    public WordLoader(String dataPath, long[] lineIndex) throws IOException {
			
 
				+        this.dataPath = dataPath;
			
 
				+        this.lineIndex = lineIndex;
			
 
				+        String filePath = String.join(File.separator, dataPath, Constant.WORD_STEM_FILE);
			
 
				+        randomAccessFile = new BufferedRandomAccessFile(filePath, "r", 10*1024*1024);
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    public @Nullable Word load(Integer integer) throws Exception {
			
 
				+        randomAccessFile.seek(lineIndex[integer]);
			
 
				+        String lineContent = new String(randomAccessFile.readLine().getBytes("8859_1"), "UTF-8");
			
 
				+
			
 
				+        // 提取关键词和分词
			
 
				+        int keyPosition = lineContent.indexOf(",");
			
 
				+        String key = keyPosition == -1 ? lineContent : lineContent.substring(0, keyPosition);
			
 
				+        Map<CharSequence, Integer> stemMap = Collections.EMPTY_MAP;
			
 
				+        if (keyPosition != -1 && lineContent.length() != keyPosition + 1) {
			
 
				+            stemMap = Arrays.asList(lineContent.substring(keyPosition+1).split(","))
			
 
				+                    .stream().collect(Collectors.toMap(Function.identity(), v -> 1, Integer::sum));
			
 
				+        }
			
 
				+
			
 
				+        return new Word(key, stemMap);
			
 
				+    }
			
 
				+
			
 
				+    public static void main(String[] args) throws Exception {
			
 
				+        int wordTotalNum = 500000;
			
 
				+        String dataPath = "E:\\ChenYL\\CodeRepository\\money-mining-python\\data\\test\\长尾词聚合分析_20240131224320";
			
 
				+        String filePath = String.join(File.separator, dataPath, Constant.WORD_STEM_FILE);
			
 
				+        BufferedRandomAccessFile randomAccessFile = new BufferedRandomAccessFile(filePath, "r");
			
 
				+        long[] lineContentIndex = new long[wordTotalNum+1];
			
 
				+        for (int i = 1; i <= wordTotalNum; i++, randomAccessFile.readLine()) {
			
 
				+            lineContentIndex[i] = randomAccessFile.getFilePointer();
			
 
				+        }
			
 
				+
			
 
				+        WordLoader wordLoader = new WordLoader(dataPath, lineContentIndex);
			
 
				+        Word load = wordLoader.load(1000);
			
 
				+        System.out.println("暂停");
			
 
				+    }
			
 
				+}
			
--- a/src/main/java/top/zhixinghe1/money/agg/entity/BufferedRandomAccessFile.java
+++ b/src/main/java/top/zhixinghe1/money/agg/entity/BufferedRandomAccessFile.java
@@ -0,0 +1,358 @@
 
				+package top.zhixinghe1.money.agg.entity;
			
 
				+
			
 
				+import java.io.File;
			
 
				+import java.io.FileNotFoundException;
			
 
				+import java.io.IOException;
			
 
				+import java.io.RandomAccessFile;
			
 
				+import java.util.Arrays;
			
 
				+import java.util.logging.Logger;
			
 
				+
			
 
				+/**
			
 
				+ * A <code>BufferedRandomAccessFile</code> is like a
			
 
				+ * <code>RandomAccessFile</code>, but it uses a private buffer so that most
			
 
				+ * operations do not require a disk access.
			
 
				+ * <P>
			
 
				+ *
			
 
				+ * Note: The operations on this class are unmonitored. Also, the correct
			
 
				+ * functioning of the <code>RandomAccessFile</code> methods that are not
			
 
				+ * overridden here relies on the implementation of those methods in the
			
 
				+ * superclass.
			
 
				+ * Author : Avinash Lakshman ( alakshman@facebook.com) & Prashant Malik ( pmalik@facebook.com )
			
 
				+ */
			
 
				+
			
 
				+public final class BufferedRandomAccessFile extends RandomAccessFile
				+{
				+    static final int LogBuffSz_ = 16; // 64K buffer
				+    public static final int BuffSz_ = (1 << LogBuffSz_);
				+    // Clears the low LogBuffSz_ bits of a position, i.e. rounds it down to a BuffSz_ boundary.
				+    static final long BuffMask_ = ~(((long) BuffSz_) - 1L);
				+
				+    /*
				+     * This implementation is based on the buffer implementation in Modula-3's
				+     * "Rd", "Wr", "RdClass", and "WrClass" interfaces.
				+     */
				+    private boolean dirty_; // true iff unflushed bytes exist
				+    private boolean closed_; // true iff the file is closed (written here, never read in this class)
				+    private long curr_; // current position in file
				+    private long lo_, hi_; // bounds on characters in "buff"
				+    private byte[] buff_; // local buffer
				+    private long maxHi_; // this.lo + this.buff.length
				+    private boolean hitEOF_; // buffer contains last file block?
				+    private long diskPos_; // disk position
				+
				+    /*
				+     * To describe the above fields, we introduce the following abstractions for
				+     * the file "f":
				+     *
				+     *   len(f)     the length of the file
				+     *   curr(f)    the current position in the file
				+     *   c(f)       the abstract contents of the file
				+     *   disk(f)    the contents of f's backing disk file
				+     *   closed(f)  true iff the file is closed
				+     *
				+     * "curr(f)" is an index in the closed interval [0, len(f)]. "c(f)" is a
				+     * character sequence of length "len(f)". "c(f)" and "disk(f)" may differ if
				+     * "c(f)" contains unflushed writes not reflected in "disk(f)". The flush
				+     * operation has the effect of making "disk(f)" identical to "c(f)".
				+     *
				+     * A file is said to be *valid* if the following conditions hold:
				+     *
				+     * V1. The "closed" and "curr" fields are correct:
				+     *
				+     *   f.closed == closed(f)
				+     *   f.curr == curr(f)
				+     *
				+     * V2. The current position is either contained in the buffer, or just past
				+     * the buffer:
				+     *
				+     *   f.lo <= f.curr <= f.hi
				+     *
				+     * V3. Any (possibly) unflushed characters are stored in "f.buff":
				+     *
				+     *   (forall i in [f.lo, f.curr): c(f)[i] == f.buff[i - f.lo])
				+     *
				+     * V4. For all characters not covered by V3, c(f) and disk(f) agree:
				+     *
				+     *   (forall i in [f.lo, len(f)): i not in [f.lo, f.curr) => c(f)[i] ==
				+     *   disk(f)[i])
				+     *
				+     * V5. "f.dirty" is true iff the buffer contains bytes that should be
				+     * flushed to the file; by V3 and V4, only part of the buffer can be dirty.
				+     *
				+     *   f.dirty == (exists i in [f.lo, f.curr): c(f)[i] != f.buff[i - f.lo])
				+     *
				+     * V6. this.maxHi == this.lo + this.buff.length
				+     *
				+     * Note that "f.buff" can be "null" in a valid file, since the range of
				+     * characters in V3 is empty when "f.lo == f.curr".
				+     *
				+     * A file is said to be *ready* if the buffer contains the current position,
				+     * i.e., when:
				+     *
				+     * R1. !f.closed && f.buff != null && f.lo <= f.curr && f.curr < f.hi
				+     *
				+     * When a file is ready, reading or writing a single byte can be performed
				+     * by reading or writing the in-memory buffer without performing a disk
				+     * operation.
				+     */
				+
				+    /**
				+     * Open a new <code>BufferedRandomAccessFile</code> on <code>file</code>
				+     * in mode <code>mode</code>, which should be "r" for reading only, or
				+     * "rw" for reading and writing. Uses the default buffer size.
				+     */
				+    public BufferedRandomAccessFile(File file, String mode) throws IOException
				+    {
				+        super(file, mode);
				+        this.init(0);
				+    }
				+
				+    /**
				+     * Open a new <code>BufferedRandomAccessFile</code> on <code>file</code>
				+     * in mode <code>mode</code> with a buffer of <code>size</code> bytes.
				+     * A <code>size</code> not larger than {@link #BuffSz_} selects
				+     * {@link #BuffSz_}.
				+     */
				+    public BufferedRandomAccessFile(File file, String mode, int size) throws IOException
				+    {
				+        super(file, mode);
				+        this.init(size);
				+    }
				+
				+    /**
				+     * Open a new <code>BufferedRandomAccessFile</code> on the file named
				+     * <code>name</code> in mode <code>mode</code>, which should be "r" for
				+     * reading only, or "rw" for reading and writing. Uses the default
				+     * buffer size.
				+     */
				+    public BufferedRandomAccessFile(String name, String mode) throws IOException
				+    {
				+        super(name, mode);
				+        this.init(0);
				+    }
				+
				+    /**
				+     * Open a new <code>BufferedRandomAccessFile</code> on the file named
				+     * <code>name</code> in mode <code>mode</code> with a buffer of
				+     * <code>size</code> bytes. A <code>size</code> not larger than
				+     * {@link #BuffSz_} selects {@link #BuffSz_}.
				+     */
				+    public BufferedRandomAccessFile(String name, String mode, int size) throws FileNotFoundException
				+    {
				+        super(name, mode);
				+        this.init(size);
				+    }
				+
				+    /* Reset all bookkeeping and allocate the buffer (at least BuffSz_ bytes). */
				+    private void init(int size)
				+    {
				+        this.dirty_ = this.closed_ = false;
				+        this.lo_ = this.curr_ = this.hi_ = 0;
				+        this.buff_ = new byte[Math.max(size, BuffSz_)];
				+        // V6 with lo == 0: track the real buffer length, which may exceed BuffSz_.
				+        this.maxHi_ = (long) this.buff_.length;
				+        this.hitEOF_ = false;
				+        this.diskPos_ = 0L;
				+    }
				+
				+    /**
				+     * Flush any unwritten bytes, then close the underlying file.
				+     */
				+    @Override
				+    public void close() throws IOException
				+    {
				+        this.flush();
				+        this.closed_ = true;
				+        super.close();
				+    }
				+
				+    /**
				+     * Flush any bytes in the file's buffer that have not yet been written to
				+     * disk. If the file was created read-only, this method is a no-op.
				+     */
				+    public void flush() throws IOException
				+    {
				+        this.flushBuffer();
				+    }
				+
				+    /* Flush any dirty bytes in the buffer to disk. */
				+    private void flushBuffer() throws IOException
				+    {
				+        if (this.dirty_)
				+        {
				+            if (this.diskPos_ != this.lo_)
				+                super.seek(this.lo_);
				+            // Only [lo, curr) can hold unflushed bytes (V3), so that is all we write.
				+            int len = (int) (this.curr_ - this.lo_);
				+            super.write(this.buff_, 0, len);
				+            this.diskPos_ = this.curr_;
				+            this.dirty_ = false;
				+        }
				+    }
				+
				+    /*
				+     * Read at most "this.buff.length" bytes into "this.buff", returning the
				+     * number of bytes read. If the return result is less than
				+     * "this.buff.length", then EOF was read and "this.hitEOF" is set.
				+     */
				+    private int fillBuffer() throws IOException
				+    {
				+        int cnt = 0;
				+        int rem = this.buff_.length;
				+        while (rem > 0)
				+        {
				+            int n = super.read(this.buff_, cnt, rem);
				+            if (n < 0)
				+                break;
				+            cnt += n;
				+            rem -= n;
				+        }
				+        // "cnt" is never negative, so EOF is detected by a short read. Recording it
				+        // lets read() report end-of-file (also after a seek beyond it) and lets
				+        // writes at end-of-file extend the buffer without re-reading the block.
				+        this.hitEOF_ = (cnt < this.buff_.length);
				+        if (this.hitEOF_)
				+        {
				+            // Clear the part that was not read so bytes left over from a previously
				+            // buffered block can never be flushed into a gap past the old end-of-file.
				+            Arrays.fill(this.buff_, cnt, this.buff_.length, (byte) 0);
				+        }
				+        this.diskPos_ += cnt;
				+        return cnt;
				+    }
				+
				+    /*
				+     * This method positions <code>this.curr</code> at position <code>pos</code>.
				+     * If <code>pos</code> does not fall in the current buffer, it flushes the
				+     * current buffer and loads the correct one.<p>
				+     *
				+     * On exit from this routine <code>this.curr == this.hi</code> iff <code>pos</code>
				+     * is at or past the end-of-file, which can only happen if the file was
				+     * opened in read-only mode.
				+     */
				+    @Override
				+    public void seek(long pos) throws IOException
				+    {
				+        if (pos >= this.hi_ || pos < this.lo_)
				+        {
				+            // seeking outside of current buffer -- flush and read
				+            this.flushBuffer();
				+            this.lo_ = pos & BuffMask_; // start at BuffSz boundary
				+            this.maxHi_ = this.lo_ + (long) this.buff_.length;
				+            if (this.diskPos_ != this.lo_)
				+            {
				+                super.seek(this.lo_);
				+                this.diskPos_ = this.lo_;
				+            }
				+            int n = this.fillBuffer();
				+            this.hi_ = this.lo_ + (long) n;
				+        }
				+        else
				+        {
				+            // seeking inside current buffer -- no read required
				+            if (pos < this.curr_)
				+            {
				+                // if seeking backwards, we must flush to maintain V4
				+                this.flushBuffer();
				+            }
				+        }
				+        this.curr_ = pos;
				+    }
				+
				+    /**
				+     * Return the current logical position in the file, which may be ahead of
				+     * the position of the underlying file because of buffering.
				+     */
				+    @Override
				+    public long getFilePointer()
				+    {
				+        return this.curr_;
				+    }
				+
				+    /**
				+     * Return the larger of the current position and the length of the
				+     * underlying file, so bytes written up to the current position are
				+     * counted even before they are flushed.
				+     */
				+    @Override
				+    public long length() throws IOException
				+    {
				+        return Math.max(this.curr_, super.length());
				+    }
				+
				+    /**
				+     * Read one byte, returning it as a value in 0..255, or -1 at end-of-file.
				+     */
				+    @Override
				+    public int read() throws IOException
				+    {
				+        if (this.curr_ >= this.hi_)
				+        {
				+            // test for EOF
				+            // if (this.hi < this.maxHi) return -1;
				+            if (this.hitEOF_)
				+                return -1;
				+
				+            // slow path -- read another buffer
				+            this.seek(this.curr_);
				+            if (this.curr_ >= this.hi_)
				+                return -1;
				+        }
				+        byte res = this.buff_[(int) (this.curr_ - this.lo_)];
				+        this.curr_++;
				+        return ((int) res) & 0xFF; // convert byte -> int
				+    }
				+
				+    /**
				+     * Read up to <code>b.length</code> bytes into <code>b</code>; see
				+     * {@link #read(byte[], int, int)}.
				+     */
				+    @Override
				+    public int read(byte[] b) throws IOException
				+    {
				+        return this.read(b, 0, b.length);
				+    }
				+
				+    /**
				+     * Read up to <code>len</code> bytes into <code>b</code> starting at
				+     * <code>off</code>. At most the bytes remaining in the current buffer are
				+     * returned, so the result may be smaller than <code>len</code> even when
				+     * more data follows. Returns the number of bytes read, or -1 at
				+     * end-of-file.
				+     */
				+    @Override
				+    public int read(byte[] b, int off, int len) throws IOException
				+    {
				+        if (this.curr_ >= this.hi_)
				+        {
				+            // test for EOF
				+            // if (this.hi < this.maxHi) return -1;
				+            if (this.hitEOF_)
				+                return -1;
				+
				+            // slow path -- read another buffer
				+            this.seek(this.curr_);
				+            if (this.curr_ >= this.hi_)
				+                return -1;
				+        }
				+        len = Math.min(len, (int) (this.hi_ - this.curr_));
				+        int buffOff = (int) (this.curr_ - this.lo_);
				+        System.arraycopy(this.buff_, buffOff, b, off, len);
				+        this.curr_ += len;
				+        return len;
				+    }
				+
				+    /**
				+     * Write the low eight bits of <code>b</code> at the current position. The
				+     * byte stays in the buffer until the buffer is flushed.
				+     */
				+    @Override
				+    public void write(int b) throws IOException
				+    {
				+        if (this.curr_ >= this.hi_)
				+        {
				+            if (this.hitEOF_ && this.hi_ < this.maxHi_)
				+            {
				+                // at EOF -- bump "hi"
				+                this.hi_++;
				+            }
				+            else
				+            {
				+                // slow path -- write current buffer; read next one
				+                this.seek(this.curr_);
				+                if (this.curr_ == this.hi_)
				+                {
				+                    // appending to EOF -- bump "hi"
				+                    this.hi_++;
				+                }
				+            }
				+        }
				+        this.buff_[(int) (this.curr_ - this.lo_)] = (byte) b;
				+        this.curr_++;
				+        this.dirty_ = true;
				+    }
				+
				+    /**
				+     * Write all of <code>b</code> at the current position.
				+     */
				+    @Override
				+    public void write(byte[] b) throws IOException
				+    {
				+        this.write(b, 0, b.length);
				+    }
				+
				+    /**
				+     * Write <code>len</code> bytes of <code>b</code> starting at
				+     * <code>off</code>, one buffer-sized piece at a time.
				+     */
				+    @Override
				+    public void write(byte[] b, int off, int len) throws IOException
				+    {
				+        while (len > 0)
				+        {
				+            int n = this.writeAtMost(b, off, len);
				+            off += n;
				+            len -= n;
				+            this.dirty_ = true;
				+        }
				+    }
				+
				+    /*
				+     * Write at most "len" bytes from "b" starting at position "off" into the
				+     * buffer, and return the number of bytes written.
				+     *
				+     * NOTE(review): when appending, "hi" is raised to "maxHi" rather than to
				+     * the end of the bytes just written, so a read issued before the next
				+     * buffer switch can return buffer bytes beyond the logical end of the
				+     * file -- confirm callers do not mix appends with reads.
				+     */
				+    private int writeAtMost(byte[] b, int off, int len) throws IOException
				+    {
				+        if (this.curr_ >= this.hi_)
				+        {
				+            if (this.hitEOF_ && this.hi_ < this.maxHi_)
				+            {
				+                // at EOF -- bump "hi"
				+                this.hi_ = this.maxHi_;
				+            }
				+            else
				+            {
				+                // slow path -- write current buffer; read next one
				+                this.seek(this.curr_);
				+                if (this.curr_ == this.hi_)
				+                {
				+                    // appending to EOF -- bump "hi"
				+                    this.hi_ = this.maxHi_;
				+                }
				+            }
				+        }
				+        len = Math.min(len, (int) (this.hi_ - this.curr_));
				+        int buffOff = (int) (this.curr_ - this.lo_);
				+        System.arraycopy(b, off, this.buff_, buffOff, len);
				+        this.curr_ += len;
				+        return len;
				+    }
				+}