Explorar o código

feat:优化性能

ChenYL hai 2 anos
pai
achega
f88842e2f3

+ 13 - 0
pom.xml

@@ -27,6 +27,19 @@
             <artifactId>progressbar</artifactId>
             <version>0.10.0</version>
         </dependency>
+        <!-- 稀疏位图 -->
+        <dependency>
+            <groupId>org.roaringbitmap</groupId>
+            <artifactId>RoaringBitmap</artifactId>
+            <version>1.0.1</version>
+        </dependency>
+        <!--  缓存框架  -->
+        <dependency>
+            <groupId>com.github.ben-manes.caffeine</groupId>
+            <artifactId>caffeine</artifactId>
+            <version>3.1.8</version>
+        </dependency>
+
     </dependencies>
 
     <!-- 配置阿里云仓库 -->

+ 1 - 1
src/main/java/top/zhixinghe1/money/Application.java

@@ -16,7 +16,7 @@ public class Application {
             throw new Exception(String.format("输入了非法程序名:%s", funcName));
         }
         if ("agg".equals(funcName)) {
-            if (args.length != 2) {
+            if (args.length < 2) {
                 throw new Exception("Java长尾词聚合程序,输入了非法参数");
             }
             new Agg().process(args[1]);

+ 59 - 92
src/main/java/top/zhixinghe1/money/agg/Agg.java

@@ -1,35 +1,33 @@
 package top.zhixinghe1.money.agg;
 
+import com.github.benmanes.caffeine.cache.Caffeine;
+import com.github.benmanes.caffeine.cache.LoadingCache;
 import me.tongfei.progressbar.ProgressBar;
 import me.tongfei.progressbar.ProgressBarBuilder;
 import org.apache.commons.lang3.StringUtils;
+import org.roaringbitmap.RoaringBitmap;
+import top.zhixinghe1.money.agg.cache.ReverseIndexLoader;
+import top.zhixinghe1.money.agg.cache.WordLoader;
+import top.zhixinghe1.money.agg.entity.BufferedRandomAccessFile;
 import top.zhixinghe1.money.agg.entity.CalResult;
-import top.zhixinghe1.money.agg.entity.CalRunable;
 import top.zhixinghe1.money.agg.entity.CalTask;
 import top.zhixinghe1.money.agg.entity.Word;
 
 import java.io.BufferedReader;
 import java.io.BufferedWriter;
 import java.io.File;
-import java.io.FileReader;
 import java.io.FileWriter;
-import java.text.SimpleDateFormat;
+import java.io.IOException;
+import java.io.RandomAccessFile;
 import java.time.temporal.ChronoUnit;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.BitSet;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.CopyOnWriteArraySet;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.LinkedBlockingQueue;
-import java.util.function.Function;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
 
 /**
  * 文本聚合 程序
@@ -38,11 +36,7 @@ public class Agg {
 
     private static final int perTaskNum = 10000;
 
-    private static final Map<Integer, Word> wordCache = new HashMap();
-
-    private static final Map<String, Set<Integer>> indexCache = new HashMap();
-
-    private static BitSet bitmap = null;
+    private static RoaringBitmap bitmap = null;
 
     public void process(String dataDirPath) throws Exception {
         if (StringUtils.isBlank(dataDirPath)) {
@@ -56,7 +50,7 @@ public class Agg {
         }
 
         // 判断关键资源文件是否存在
-        List<String> fileNameList = Arrays.asList("长尾词.txt", "长尾词_分词.txt", "长尾词_倒排索引.txt");
+        List<String> fileNameList = Arrays.asList(Constant.WORD_STEM_FILE, Constant.WORD_REVERSE_INDEX_FILE);
         for (String fileName : fileNameList) {
             String resFilePath = String.join(File.separator, dataDirPath, fileName);
             File resfile = new File(resFilePath);
@@ -65,96 +59,41 @@ public class Agg {
             }
         }
 
-        int totalWord = 0;
-
-        Pattern pattern = Pattern.compile("([^,]+)");
-
-        // 构造关键词缓存
-        try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_分词.txt"));
-             BufferedReader br = new BufferedReader(reader)) {
-            String line = null;
-            while ((line = br.readLine()) != null) {
-                if (StringUtils.isBlank(line)) {
-                    continue;
-                }
-
-                // 记录总文本数
-                totalWord ++;
-
-                // 提取关键词和分词
-                Matcher matcher = pattern.matcher(line);
-                if (!matcher.find()) {
-                    continue;
-                }
-                String key = matcher.group();
-                if (StringUtils.isBlank(key)) {
-                    continue;
-                }
-
-                List<String> stems = new ArrayList<>();
-                while (matcher.find()) {
-                    String stem = matcher.group();
-                    if (StringUtils.isBlank(stem)) {
-                        continue;
-                    }
-                    stems.add(stem);
-                }
-                Map<CharSequence, Integer> stemMap = stems.stream().collect(Collectors.toMap(Function.identity(), v -> 1, Integer::sum));
-
-                wordCache.put(totalWord, new Word(key, stemMap));
-            }
-        }
-
-        // 构建倒排索引缓存
-        try (FileReader reader = new FileReader(String.join(File.separator, dataDirPath, "长尾词_倒排索引.txt"));
-             BufferedReader br = new BufferedReader(reader)) {
-            String line = null;
-            while ((line = br.readLine()) != null) {
-                if (StringUtils.isBlank(line)) {
-                    continue;
-                }
+        // 读取统计信息
+        int wordTotalNum = 499995;
+        int reverseIndexTotalNum = 88090;
 
-                // 提取关键词和分词
-                Matcher matcher = pattern.matcher(line);
-                if (!matcher.find()) {
-                    continue;
-                }
-                String stem = matcher.group();
-                if (StringUtils.isBlank(stem)) {
-                    continue;
-                }
+        // 创建关键词位置缓存
+        String wordFilePath = String.join(File.separator, dataDirPath, Constant.WORD_STEM_FILE);
+        long[] lineContentIndex = getLineContentIndex(wordTotalNum, wordFilePath);
+        LoadingCache<Integer, Word> wordCache = Caffeine.newBuilder()
+                .maximumSize(1000000)
+                .build(new WordLoader(dataDirPath, lineContentIndex));
 
-                Set<Integer> positions = new CopyOnWriteArraySet<>();
-                while (matcher.find()) {
-                    String position = matcher.group();
-                    if (StringUtils.isBlank(position)) {
-                        continue;
-                    }
-                    positions.add(Integer.valueOf(position));
-                }
-
-                indexCache.put(stem, positions);
-            }
-        }
+        // 创建倒排索引缓存
+        String stemFilePath = String.join(File.separator, dataDirPath, Constant.WORD_REVERSE_INDEX_FILE);
+        Map<String, long[]> lienContentIndexMap = getLienContentIndexMap(reverseIndexTotalNum, stemFilePath);
+        LoadingCache<String, RoaringBitmap> indexCache = Caffeine.newBuilder()
+                .maximumSize(1000000)
+                .build(new ReverseIndexLoader(dataDirPath, lienContentIndexMap));
 
         // 初始化已处理位图
-        bitmap = new BitSet(totalWord+1);
+        bitmap = new RoaringBitmap();
 
         // 分割计算任务
-        List<CalTask> calTasks = avgSplitTask(totalWord, perTaskNum);
+        List<CalTask> calTasks = avgSplitTask(wordTotalNum, perTaskNum);
 
         LinkedBlockingQueue<CalResult> queue = new LinkedBlockingQueue();
 
         // 提交任务
         ExecutorService executorService = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
         for (CalTask calTask : calTasks) {
-            executorService.submit(new CalRunable(calTask.getStartPos(), calTask.getEndPos(), wordCache, indexCache, bitmap, queue));
+            executorService.submit(new CalRunnable(calTask.getStartPos(), calTask.getEndPos(), wordCache, indexCache, bitmap, queue));
         }
 
-        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
-        String aggFilePath = String.join(File.separator, dataDirPath, "长尾词_聚合结果_临时.txt");
+        String aggFilePath = String.join(File.separator, dataDirPath, Constant.WORD_AGG_RESULT_TEMP_FILE);
         ProgressBarBuilder progressBarBuilder = new ProgressBarBuilder().setTaskName("文本聚合计算")
-                .setInitialMax(totalWord)
+                .setInitialMax(wordTotalNum)
                 .setUnit("个", 1)
                 .setSpeedUnit(ChronoUnit.SECONDS)
                 .showSpeed();
@@ -204,4 +143,32 @@ public class Agg {
         }
         return calTasks;
     }
+
+    /**
+     * Builds a 1-based table of the byte offsets at which the lines of a file start.
+     *
+     * <p>Entry {@code i} (for {@code 1 <= i <= wordTotalNum}) holds the file pointer observed just
+     * before line {@code i} is read; entry {@code 0} is unused and stays {@code 0}.
+     *
+     * @param wordTotalNum number of lines to index
+     * @param filePath     path of the file to scan
+     * @return array of length {@code wordTotalNum + 1} with the start offset of every line
+     * @throws IOException if the file cannot be opened or read
+     */
+    private long[] getLineContentIndex(int wordTotalNum, String filePath) throws IOException {
+        long[] lineContentIndex = new long[wordTotalNum + 1];
+        // try-with-resources: the file used to stay open after the scan, leaking a descriptor.
+        try (BufferedRandomAccessFile randomAccessFile = new BufferedRandomAccessFile(filePath, "r")) {
+            for (int i = 1; i <= wordTotalNum; i++) {
+                lineContentIndex[i] = randomAccessFile.getFilePointer();
+                // Advance to the start of the next line.
+                randomAccessFile.readLine();
+            }
+        }
+        return lineContentIndex;
+    }
+
+    /**
+     * Scans the reverse-index file and records, for every stem, where its line is located.
+     *
+     * <p>Each line is expected to look like {@code stem,pos1,pos2,...}. The value stored for a stem
+     * is a two-element array: {@code [0]} is the byte offset of the first byte of the line and
+     * {@code [1]} is the length of the line in bytes, line terminator excluded. Lines without a
+     * comma (including blank lines) are skipped.
+     *
+     * @param stemTotalNum expected number of stems; only used to pre-size the map
+     * @param filePath     path of the reverse-index file (UTF-8 text)
+     * @return map from stem to {@code {lineStartOffset, lineByteLength}}
+     * @throws IOException if the file cannot be opened or read
+     */
+    private ConcurrentHashMap<String, long[]> getLienContentIndexMap(int stemTotalNum, String filePath) throws IOException {
+        ConcurrentHashMap<String, long[]> lineContentIndexMap = new ConcurrentHashMap<>(stemTotalNum);
+        // TODO this scan is too slow
+        try (RandomAccessFile randomAccessFile = new RandomAccessFile(filePath, "r")) {
+            while (true) {
+                // Take the offset BEFORE reading: after readLine() the pointer is already at the
+                // start of the next line, which is what used to be stored by mistake.
+                long lineStart = randomAccessFile.getFilePointer();
+                String rawLine = randomAccessFile.readLine();
+                if (rawLine == null) {
+                    break;
+                }
+                // readLine() maps every byte to one char, so offsets and lengths in rawLine are
+                // byte offsets and byte lengths. ',' is a single ASCII byte and can never occur
+                // inside a UTF-8 multi-byte sequence, so it is safe to look for it before decoding.
+                int comma = rawLine.indexOf(',');
+                if (comma < 0) {
+                    continue;
+                }
+                String stem = new String(
+                        rawLine.substring(0, comma).getBytes(java.nio.charset.StandardCharsets.ISO_8859_1),
+                        java.nio.charset.StandardCharsets.UTF_8);
+                long[] basicInfo = new long[2];
+                basicInfo[0] = lineStart;
+                basicInfo[1] = rawLine.length();
+                lineContentIndexMap.put(stem, basicInfo);
+            }
+        }
+        return lineContentIndexMap;
+    }
 }

+ 21 - 23
src/main/java/top/zhixinghe1/money/agg/entity/CalRunable.java → src/main/java/top/zhixinghe1/money/agg/CalRunnable.java

@@ -1,13 +1,14 @@
-package top.zhixinghe1.money.agg.entity;
+package top.zhixinghe1.money.agg;
 
+import com.github.benmanes.caffeine.cache.LoadingCache;
 import org.apache.commons.text.similarity.CosineSimilarity;
+import org.roaringbitmap.RoaringBitmap;
+import top.zhixinghe1.money.agg.entity.CalResult;
+import top.zhixinghe1.money.agg.entity.Word;
 
 import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
 import java.util.concurrent.LinkedBlockingQueue;
@@ -15,17 +16,17 @@ import java.util.concurrent.LinkedBlockingQueue;
 /**
  * 计算任务对象
  */
-public class CalRunable implements Runnable {
+public class CalRunnable implements Runnable {
 
     private int start;
 
     private int end;
 
-    private Map<Integer, Word> wordCache = new HashMap();
+    private LoadingCache<Integer, Word> wordCache;
 
-    private Map<String, Set<Integer>> indexCache = new HashMap();
+    private LoadingCache<String, RoaringBitmap> indexCache;
 
-    private BitSet bitmap = null;
+    private RoaringBitmap bitmap;
 
     private CosineSimilarity cosineSimilarity = new CosineSimilarity();
 
@@ -33,10 +34,9 @@ public class CalRunable implements Runnable {
 
     private LinkedBlockingQueue<CalResult> queue;
 
-    private Set<Integer> indexSet = new HashSet<>();
     private List<String> result = new ArrayList<>();
 
-    public CalRunable(int start, int end, Map<Integer, Word> wordCache, Map<String, Set<Integer>> indexCache, BitSet bitmap, LinkedBlockingQueue<CalResult> queue) {
+    public CalRunnable(int start, int end, LoadingCache<Integer, Word> wordCache, LoadingCache<String, RoaringBitmap> indexCache, RoaringBitmap bitmap, LinkedBlockingQueue<CalResult> queue) {
         this.start = start;
         this.end = end;
         this.wordCache = wordCache;
@@ -49,7 +49,7 @@ public class CalRunable implements Runnable {
     public void run() {
         try {
             for (int i = start; i <= end; i++) {
-                CalResult calResult = null;
+                CalResult calResult;
                 if (cal(i)) {
                     calResult = new CalResult(true, new ArrayList<>(result));
                 } else {
@@ -65,31 +65,28 @@ public class CalRunable implements Runnable {
 
     private boolean cal(int i) {
         // 判断是否已进行计算
-        if (bitmap.get(i)) {
+        if (bitmap.contains(i)) {
             return false;
         }
 
         // 清除上一轮的数据
-        indexSet.clear();
         result.clear();
 
         Word word = wordCache.get(i);
         if (Objects.isNull(word.getStemMap()) || word.getStemMap().size() == 0) {
             return false;
         }
-        bitmap.set(i, true);
+        bitmap.add(i);
         result.add(word.getKey());
+
+        RoaringBitmap finalBitmap = new RoaringBitmap();
         for (CharSequence stem : word.getStemMap().keySet()) {
-            Set<Integer> positions = indexCache.get(stem);
-            for (Integer position : positions) {
-                if (bitmap.get(position)) {
-                    positions.remove(position);
-                } else {
-                    indexSet.add(position);
-                }
-            }
+            RoaringBitmap stemBitmap = indexCache.get((String) stem);
+            finalBitmap.or(stemBitmap);
         }
-        for (Integer index : indexSet) {
+        finalBitmap.andNot(bitmap);
+
+        for (Integer index : finalBitmap) {
             Word candicateWord = wordCache.get(index);
             if (Objects.isNull(candicateWord.getStemMap())) {
                 continue;
@@ -98,6 +95,7 @@ public class CalRunable implements Runnable {
             if (v < aggThreshold) {
                 continue;
             }
+            bitmap.add(index);
             result.add(candicateWord.getKey());
         }
         // 输出计算结果

+ 22 - 0
src/main/java/top/zhixinghe1/money/agg/Constant.java

@@ -0,0 +1,22 @@
+package top.zhixinghe1.money.agg;
+
+/**
+ * File-name constants.
+ */
+public class Constant {
+
+    /**
+     * Long-tail keyword word-segmentation file.
+     */
+    public static final String WORD_STEM_FILE = "长尾词_分词.txt";
+
+    /**
+     * Long-tail keyword reverse (inverted) index file.
+     */
+    public static final String WORD_REVERSE_INDEX_FILE = "长尾词_倒排索引.txt";
+
+    /**
+     * Temporary file for the long-tail keyword aggregation result.
+     */
+    public static final String WORD_AGG_RESULT_TEMP_FILE = "长尾词_聚合结果_临时.txt";
+}

+ 18 - 0
src/main/java/top/zhixinghe1/money/agg/Test.java

@@ -0,0 +1,18 @@
+package top.zhixinghe1.money.agg;
+
+import org.roaringbitmap.ImmutableBitmapDataProvider;
+import org.roaringbitmap.IntConsumer;
+import org.roaringbitmap.RoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+import org.roaringbitmap.longlong.IntegerUtil;
+
+import java.util.HashMap;
+
+/**
+ * Scratch program that iterates over a small {@link RoaringBitmap}.
+ */
+public class Test {
+    /**
+     * Prints every value of a sample bitmap on its own line.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        RoaringBitmap sample = RoaringBitmap.bitmapOf(1, 3, 7, 10);
+        java.util.Iterator<Integer> cursor = sample.iterator();
+        while (cursor.hasNext()) {
+            Integer value = cursor.next();
+            System.out.println(value);
+        }
+    }
+}

+ 68 - 0
src/main/java/top/zhixinghe1/money/agg/cache/ReverseIndexLoader.java

@@ -0,0 +1,68 @@
+package top.zhixinghe1.money.agg.cache;
+
+import com.github.benmanes.caffeine.cache.CacheLoader;
+import org.checkerframework.checker.nullness.qual.Nullable;
+import org.roaringbitmap.RoaringBitmap;
+import top.zhixinghe1.money.agg.Constant;
+import top.zhixinghe1.money.agg.entity.BufferedRandomAccessFile;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.Map;
+
+/**
+ * Cache loader that turns the reverse-index line of a stem into a {@link RoaringBitmap} of
+ * positions.
+ *
+ * <p>The reverse-index file is memory-mapped once in the constructor. Each {@link #load(String)}
+ * copies the bytes of the requested line out of the mapping, using the
+ * {@code {offset, byteLength}} pair found in the line index, and parses the comma-separated
+ * positions that follow the stem.
+ */
+public class ReverseIndexLoader implements CacheLoader<String, RoaringBitmap> {
+
+    /**
+     * Data directory path.
+     */
+    private String dataPath;
+
+    /**
+     * Line index: stem -&gt; {byte offset of the line, byte length of the line}.
+     */
+    private Map<String, long[]> lineIndex;
+
+    /**
+     * Handle on the reverse-index file.
+     */
+    private RandomAccessFile randomAccessFile;
+
+    /**
+     * Read-only mapping of the reverse-index file.
+     */
+    private MappedByteBuffer mappedByteBuffer;
+
+    /**
+     * Opens and memory-maps the reverse-index file found in {@code dataPath}.
+     *
+     * @param dataPath  data directory containing {@link Constant#WORD_REVERSE_INDEX_FILE}
+     * @param lineIndex stem -&gt; {line offset, line byte length}
+     * @throws IOException              if the file cannot be opened or mapped
+     * @throws IllegalArgumentException if the file is larger than {@code Integer.MAX_VALUE} bytes,
+     *                                  the largest region a single mapping can cover
+     */
+    public ReverseIndexLoader(String dataPath, Map<String, long[]> lineIndex) throws IOException {
+        this.dataPath = dataPath;
+        this.lineIndex = lineIndex;
+        String filePath = String.join(File.separator, dataPath, Constant.WORD_REVERSE_INDEX_FILE);
+        randomAccessFile = new RandomAccessFile(filePath, "r");
+        // Assign the FIELD: a shadowing local used to be declared here, which left the field null.
+        // Map exactly the file length; Integer.MAX_VALUE asked for more bytes than the file has.
+        mappedByteBuffer = randomAccessFile.getChannel()
+                .map(FileChannel.MapMode.READ_ONLY, 0, randomAccessFile.length());
+    }
+
+    /**
+     * Loads the positions recorded for a stem.
+     *
+     * @param s the stem to look up
+     * @return bitmap of the positions listed on the stem's line; an empty bitmap when the stem is
+     *         not in the line index or its line lists no positions
+     * @throws IllegalStateException if the line contains a token that is not an integer
+     */
+    @Override
+    public @Nullable RoaringBitmap load(String s) throws Exception {
+        RoaringBitmap positions = new RoaringBitmap();
+        long[] basicInfo = lineIndex.get(s);
+        if (basicInfo == null) {
+            // Unknown stem: report "no positions" instead of failing with a NullPointerException.
+            return positions;
+        }
+        // The index entry has two slots: [0] offset, [1] byte length (slot [2] does not exist).
+        int offset = Math.toIntExact(basicInfo[0]);
+        int length = Math.toIntExact(basicInfo[1]);
+
+        // Read through a duplicate so every call has its own position and concurrent loads do not
+        // disturb each other. get() copies OUT of the mapping; put() would try to write into it.
+        byte[] buff = new byte[length];
+        java.nio.ByteBuffer view = mappedByteBuffer.duplicate();
+        view.position(offset);
+        view.get(buff, 0, length);
+        String lineContent = new String(buff, java.nio.charset.StandardCharsets.UTF_8);
+
+        int comma = lineContent.indexOf(",");
+        if (comma < 0) {
+            // Stem without any position.
+            return positions;
+        }
+        for (String token : lineContent.substring(comma + 1).split(",")) {
+            String trimmed = token.trim();
+            if (trimmed.isEmpty()) {
+                continue;
+            }
+            try {
+                positions.add(Integer.parseInt(trimmed));
+            } catch (NumberFormatException e) {
+                // Fail loudly: silently keeping a default 0 would add a bogus position.
+                throw new IllegalStateException(
+                        String.format("Malformed position '%s' in reverse index line of stem '%s'", trimmed, s), e);
+            }
+        }
+        return positions;
+    }
+}

+ 72 - 0
src/main/java/top/zhixinghe1/money/agg/cache/WordLoader.java

@@ -0,0 +1,72 @@
+package top.zhixinghe1.money.agg.cache;
+
+import com.github.benmanes.caffeine.cache.CacheLoader;
+import org.checkerframework.checker.nullness.qual.Nullable;
+import top.zhixinghe1.money.agg.Constant;
+import top.zhixinghe1.money.agg.entity.BufferedRandomAccessFile;
+import top.zhixinghe1.money.agg.entity.Word;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+/**
+ * Cache loader that reads one line of the word-segmentation file and turns it into a {@link Word}.
+ *
+ * <p>A line is expected to look like {@code key,stem1,stem2,...}. The line to read is located
+ * through a table of line start offsets indexed by line number.
+ */
+public class WordLoader implements CacheLoader<Integer, Word> {
+
+    /**
+     * Data directory path.
+     */
+    private String dataPath;
+
+    /**
+     * Line index: line number -&gt; byte offset at which the line starts.
+     */
+    private long[] lineIndex;
+
+    /**
+     * File reader shared by all loads; access to it is serialised inside {@link #load(Integer)}.
+     */
+    private BufferedRandomAccessFile randomAccessFile;
+
+    /**
+     * Opens the word-segmentation file found in {@code dataPath}.
+     *
+     * @param dataPath  data directory containing {@link Constant#WORD_STEM_FILE}
+     * @param lineIndex line number -&gt; start offset of that line
+     * @throws IOException if the file cannot be opened
+     */
+    public WordLoader(String dataPath, long[] lineIndex) throws IOException {
+        this.dataPath = dataPath;
+        this.lineIndex = lineIndex;
+        String filePath = String.join(File.separator, dataPath, Constant.WORD_STEM_FILE);
+        randomAccessFile = new BufferedRandomAccessFile(filePath, "r", 10*1024*1024);
+    }
+
+    /**
+     * Loads the word stored on the given line.
+     *
+     * @param integer line number, used as an index into the line-offset table
+     * @return the word with its key and a stem -&gt; occurrence-count map (empty when the line has
+     *         no stems)
+     * @throws java.io.EOFException if the recorded offset is at or past the end of the file
+     */
+    @Override
+    public @Nullable Word load(Integer integer) throws Exception {
+        String rawLine;
+        // seek + readLine must happen as one step: the file handle has a single position and is
+        // not thread-safe, while the cache is queried from several worker threads.
+        synchronized (randomAccessFile) {
+            randomAccessFile.seek(lineIndex[integer]);
+            rawLine = randomAccessFile.readLine();
+        }
+        if (rawLine == null) {
+            throw new java.io.EOFException("No content for line " + integer + " in " + Constant.WORD_STEM_FILE);
+        }
+        // readLine() maps each byte to one char; re-decode those bytes as UTF-8.
+        String lineContent = new String(
+                rawLine.getBytes(java.nio.charset.StandardCharsets.ISO_8859_1),
+                java.nio.charset.StandardCharsets.UTF_8);
+
+        // Extract the key and its stems.
+        int keyPosition = lineContent.indexOf(",");
+        String key = keyPosition == -1 ? lineContent : lineContent.substring(0, keyPosition);
+        Map<CharSequence, Integer> stemMap = Collections.emptyMap();
+        if (keyPosition != -1 && lineContent.length() != keyPosition + 1) {
+            // Count how many times each stem occurs on the line.
+            stemMap = Arrays.stream(lineContent.substring(keyPosition+1).split(","))
+                    .collect(Collectors.toMap(Function.identity(), v -> 1, Integer::sum));
+        }
+
+        return new Word(key, stemMap);
+    }
+
+    /**
+     * Manual smoke test: indexes a local data file and loads line 1000.
+     *
+     * @param args unused
+     * @throws Exception if the data file cannot be read
+     */
+    public static void main(String[] args) throws Exception {
+        int wordTotalNum = 500000;
+        String dataPath = "E:\\ChenYL\\CodeRepository\\money-mining-python\\data\\test\\长尾词聚合分析_20240131224320";
+        String filePath = String.join(File.separator, dataPath, Constant.WORD_STEM_FILE);
+        long[] lineContentIndex = new long[wordTotalNum+1];
+        // Close the indexing handle once the offsets are collected.
+        try (BufferedRandomAccessFile randomAccessFile = new BufferedRandomAccessFile(filePath, "r")) {
+            for (int i = 1; i <= wordTotalNum; i++, randomAccessFile.readLine()) {
+                lineContentIndex[i] = randomAccessFile.getFilePointer();
+            }
+        }
+
+        WordLoader wordLoader = new WordLoader(dataPath, lineContentIndex);
+        Word load = wordLoader.load(1000);
+        System.out.println("暂停");
+    }
+}

+ 358 - 0
src/main/java/top/zhixinghe1/money/agg/entity/BufferedRandomAccessFile.java

@@ -0,0 +1,358 @@
+package top.zhixinghe1.money.agg.entity;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.util.Arrays;
+import java.util.logging.Logger;
+
+/**
+ * A <code>BufferedRandomAccessFile</code> is like a
+ * <code>RandomAccessFile</code>, but it uses a private buffer so that most
+ * operations do not require a disk access.
+ * <P>
+ *
+ * Note: The operations on this class are unmonitored. Also, the correct
+ * functioning of the <code>RandomAccessFile</code> methods that are not
+ * overridden here relies on the implementation of those methods in the
+ * superclass.
+ * Author : Avinash Lakshman ( alakshman@facebook.com) & Prashant Malik ( pmalik@facebook.com )
+ */
+
+public final class BufferedRandomAccessFile extends RandomAccessFile
+{
+    static final int LogBuffSz_ = 16; // 64K buffer
+    public static final int BuffSz_ = (1 << LogBuffSz_);
+    static final long BuffMask_ = ~(((long) BuffSz_) - 1L);
+
+    /*
+     * This implementation is based on the buffer implementation in Modula-3's
+     * "Rd", "Wr", "RdClass", and "WrClass" interfaces.
+     */
+    private boolean dirty_; // true iff unflushed bytes exist
+    private boolean closed_; // true iff the file is closed
+    private long curr_; // current position in file
+    private long lo_, hi_; // bounds on characters in "buff"
+    private byte[] buff_; // local buffer
+    private long maxHi_; // this.lo + this.buff.length
+    private boolean hitEOF_; // buffer contains last file block?
+    private long diskPos_; // disk position
+
+    /*
+     * To describe the above fields, we introduce the following abstractions for
+     * the file "f":
+     *
+     * len(f) the length of the file curr(f) the current position in the file
+     * c(f) the abstract contents of the file disk(f) the contents of f's
+     * backing disk file closed(f) true iff the file is closed
+     *
+     * "curr(f)" is an index in the closed interval [0, len(f)]. "c(f)" is a
+     * character sequence of length "len(f)". "c(f)" and "disk(f)" may differ if
+     * "c(f)" contains unflushed writes not reflected in "disk(f)". The flush
+     * operation has the effect of making "disk(f)" identical to "c(f)".
+     *
+     * A file is said to be *valid* if the following conditions hold:
+     *
+     * V1. The "closed" and "curr" fields are correct:
+     *
+     * f.closed == closed(f) f.curr == curr(f)
+     *
+     * V2. The current position is either contained in the buffer, or just past
+     * the buffer:
+     *
+     * f.lo <= f.curr <= f.hi
+     *
+     * V3. Any (possibly) unflushed characters are stored in "f.buff":
+     *
+     * (forall i in [f.lo, f.curr): c(f)[i] == f.buff[i - f.lo])
+     *
+     * V4. For all characters not covered by V3, c(f) and disk(f) agree:
+     *
+     * (forall i in [f.lo, len(f)): i not in [f.lo, f.curr) => c(f)[i] ==
+     * disk(f)[i])
+     *
+     * V5. "f.dirty" is true iff the buffer contains bytes that should be
+     * flushed to the file; by V3 and V4, only part of the buffer can be dirty.
+     *
+     * f.dirty == (exists i in [f.lo, f.curr): c(f)[i] != f.buff[i - f.lo])
+     *
+     * V6. this.maxHi == this.lo + this.buff.length
+     *
+     * Note that "f.buff" can be "null" in a valid file, since the range of
+     * characters in V3 is empty when "f.lo == f.curr".
+     *
+     * A file is said to be *ready* if the buffer contains the current position,
+     * i.e., when:
+     *
+     * R1. !f.closed && f.buff != null && f.lo <= f.curr && f.curr < f.hi
+     *
+     * When a file is ready, reading or writing a single byte can be performed
+     * by reading or writing the in-memory buffer without performing a disk
+     * operation.
+     */
+
+    /**
+     * Open a new <code>BufferedRandomAccessFile</code> on <code>file</code>
+     * in mode <code>mode</code>, which should be "r" for reading only, or
+     * "rw" for reading and writing.
+     */
+    public BufferedRandomAccessFile(File file, String mode) throws IOException
+    {
+        super(file, mode);
+        this.init(0);
+    }
+
+    /**
+     * Same as {@link #BufferedRandomAccessFile(File, String)} with a buffer of
+     * <code>size</code> bytes; sizes not larger than {@link #BuffSz_} fall back
+     * to {@link #BuffSz_}.
+     */
+    public BufferedRandomAccessFile(File file, String mode, int size) throws IOException
+    {
+        super(file, mode);
+        this.init(size);
+    }
+
+    /**
+     * Open a new <code>BufferedRandomAccessFile</code> on the file named
+     * <code>name</code> in mode <code>mode</code>, which should be "r" for
+     * reading only, or "rw" for reading and writing.
+     */
+    public BufferedRandomAccessFile(String name, String mode) throws IOException
+    {
+        super(name, mode);
+        this.init(0);
+    }
+
+    /**
+     * Same as {@link #BufferedRandomAccessFile(String, String)} with a buffer of
+     * <code>size</code> bytes; sizes not larger than {@link #BuffSz_} fall back
+     * to {@link #BuffSz_}.
+     */
+    public BufferedRandomAccessFile(String name, String mode, int size) throws FileNotFoundException
+    {
+        super(name, mode);
+        this.init(size);
+    }
+
+    /* Reset all bookkeeping fields and allocate the buffer. */
+    private void init(int size)
+    {
+        this.dirty_ = this.closed_ = false;
+        this.lo_ = this.curr_ = this.hi_ = 0;
+        this.buff_ = (size > BuffSz_) ? new byte[size] : new byte[BuffSz_];
+        // V6: maxHi == lo + buff.length (the buffer may be larger than BuffSz_)
+        this.maxHi_ = (long) this.buff_.length;
+        this.hitEOF_ = false;
+        this.diskPos_ = 0L;
+    }
+
+    /** Flush pending writes, then close the underlying file. */
+    @Override
+    public void close() throws IOException
+    {
+        this.flush();
+        this.closed_ = true;
+        super.close();
+    }
+
+    /**
+     * Flush any bytes in the file's buffer that have not yet been written to
+     * disk. If the file was created read-only, this method is a no-op.
+     */
+    public void flush() throws IOException
+    {
+        this.flushBuffer();
+    }
+
+    /* Flush any dirty bytes in the buffer to disk. */
+    private void flushBuffer() throws IOException
+    {
+        if (this.dirty_)
+        {
+            if (this.diskPos_ != this.lo_)
+                super.seek(this.lo_);
+            int len = (int) (this.curr_ - this.lo_);
+            super.write(this.buff_, 0, len);
+            this.diskPos_ = this.curr_;
+            this.dirty_ = false;
+        }
+    }
+
+    /*
+     * Read at most "this.buff.length" bytes into "this.buff", returning the
+     * number of bytes read. If the return result is less than
+     * "this.buff.length", then EOF was read.
+     */
+    private int fillBuffer() throws IOException
+    {
+        int cnt = 0;
+        int rem = this.buff_.length;
+        while (rem > 0)
+        {
+            int n = super.read(this.buff_, cnt, rem);
+            if (n < 0)
+                break;
+            cnt += n;
+            rem -= n;
+        }
+        // Record EOF on every fill. The former guard "(cnt < 0) && ..." could never be
+        // true (cnt starts at 0 and only grows), so hitEOF_ was never set: reads after
+        // seeking past EOF returned stale buffer bytes, and every read at EOF refilled
+        // the whole buffer.
+        this.hitEOF_ = (cnt < this.buff_.length);
+        if (this.hitEOF_)
+        {
+            // Clear the part of the buffer that was not read so that no bytes of a
+            // previous block survive there. Zero matches the content of the freshly
+            // allocated buffer, which is what this region held before this fix.
+            Arrays.fill(this.buff_, cnt, this.buff_.length, (byte) 0);
+        }
+        this.diskPos_ += cnt;
+        return cnt;
+    }
+
+    /*
+     * This method positions <code>this.curr</code> at position <code>pos</code>.
+     * If <code>pos</code> does not fall in the current buffer, it flushes the
+     * current buffer and loads the correct one.<p>
+     *
+     * On exit from this routine <code>this.curr == this.hi</code> iff <code>pos</code>
+     * is at or past the end-of-file, which can only happen if the file was
+     * opened in read-only mode.
+     */
+    @Override
+    public void seek(long pos) throws IOException
+    {
+        if (pos >= this.hi_ || pos < this.lo_)
+        {
+            // seeking outside of current buffer -- flush and read
+            this.flushBuffer();
+            this.lo_ = pos & BuffMask_; // start at BuffSz boundary
+            this.maxHi_ = this.lo_ + (long) this.buff_.length;
+            if (this.diskPos_ != this.lo_)
+            {
+                super.seek(this.lo_);
+                this.diskPos_ = this.lo_;
+            }
+            int n = this.fillBuffer();
+            this.hi_ = this.lo_ + (long) n;
+        }
+        else
+        {
+            // seeking inside current buffer -- no read required
+            if (pos < this.curr_)
+            {
+                // if seeking backwards, we must flush to maintain V4
+                this.flushBuffer();
+            }
+        }
+        this.curr_ = pos;
+    }
+
+    /** Returns the logical position, which may be ahead of the on-disk position. */
+    @Override
+    public long getFilePointer()
+    {
+        return this.curr_;
+    }
+
+    /** Returns the file length, counting bytes written to the buffer but not yet flushed. */
+    @Override
+    public long length() throws IOException
+    {
+        return Math.max(this.curr_, super.length());
+    }
+
+    /** Reads one byte from the buffer, refilling it when needed; returns -1 at end of file. */
+    @Override
+    public int read() throws IOException
+    {
+        if (this.curr_ >= this.hi_)
+        {
+            // test for EOF
+            // if (this.hi < this.maxHi) return -1;
+            if (this.hitEOF_)
+                return -1;
+
+            // slow path -- read another buffer
+            this.seek(this.curr_);
+            if (this.curr_ == this.hi_)
+                return -1;
+        }
+        byte res = this.buff_[(int) (this.curr_ - this.lo_)];
+        this.curr_++;
+        return ((int) res) & 0xFF; // convert byte -> int
+    }
+
+    @Override
+    public int read(byte[] b) throws IOException
+    {
+        return this.read(b, 0, b.length);
+    }
+
+    /**
+     * Copies up to <code>len</code> bytes out of the buffer. May return fewer bytes
+     * than requested when the buffer ends first; returns -1 at end of file.
+     */
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException
+    {
+        if (this.curr_ >= this.hi_)
+        {
+            // test for EOF
+            // if (this.hi < this.maxHi) return -1;
+            if (this.hitEOF_)
+                return -1;
+
+            // slow path -- read another buffer
+            this.seek(this.curr_);
+            if (this.curr_ == this.hi_)
+                return -1;
+        }
+        len = Math.min(len, (int) (this.hi_ - this.curr_));
+        int buffOff = (int) (this.curr_ - this.lo_);
+        System.arraycopy(this.buff_, buffOff, b, off, len);
+        this.curr_ += len;
+        return len;
+    }
+
+    /** Writes one byte into the buffer and marks the buffer dirty. */
+    @Override
+    public void write(int b) throws IOException
+    {
+        if (this.curr_ >= this.hi_)
+        {
+            if (this.hitEOF_ && this.hi_ < this.maxHi_)
+            {
+                // at EOF -- bump "hi"
+                this.hi_++;
+            }
+            else
+            {
+                // slow path -- write current buffer; read next one
+                this.seek(this.curr_);
+                if (this.curr_ == this.hi_)
+                {
+                    // appending to EOF -- bump "hi"
+                    this.hi_++;
+                }
+            }
+        }
+        this.buff_[(int) (this.curr_ - this.lo_)] = (byte) b;
+        this.curr_++;
+        this.dirty_ = true;
+    }
+
+    @Override
+    public void write(byte[] b) throws IOException
+    {
+        this.write(b, 0, b.length);
+    }
+
+    /** Writes <code>len</code> bytes, one buffer-sized piece at a time. */
+    @Override
+    public void write(byte[] b, int off, int len) throws IOException
+    {
+        while (len > 0)
+        {
+            int n = this.writeAtMost(b, off, len);
+            off += n;
+            len -= n;
+            this.dirty_ = true;
+        }
+    }
+
+    /*
+     * Write at most "len" bytes to "b" starting at position "off", and return
+     * the number of bytes written.
+     */
+    private int writeAtMost(byte[] b, int off, int len) throws IOException
+    {
+        if (this.curr_ >= this.hi_)
+        {
+            if (this.hitEOF_ && this.hi_ < this.maxHi_)
+            {
+                // at EOF -- bump "hi"
+                this.hi_ = this.maxHi_;
+            }
+            else
+            {
+                // slow path -- write current buffer; read next one
+                this.seek(this.curr_);
+                if (this.curr_ == this.hi_)
+                {
+                    // appending to EOF -- bump "hi"
+                    this.hi_ = this.maxHi_;
+                }
+            }
+        }
+        len = Math.min(len, (int) (this.hi_ - this.curr_));
+        int buffOff = (int) (this.curr_ - this.lo_);
+        System.arraycopy(b, off, this.buff_, buffOff, len);
+        this.curr_ += len;
+        return len;
+    }
+}