|
|
@@ -16,6 +16,7 @@ import java.util.List;
|
|
|
import java.util.Map;
|
|
|
import java.util.Objects;
|
|
|
import java.util.Set;
|
|
|
+import java.util.concurrent.LinkedBlockingQueue;
|
|
|
|
|
|
public class CalRunable implements Runnable {
|
|
|
|
|
|
@@ -33,91 +34,81 @@ public class CalRunable implements Runnable {
|
|
|
|
|
|
private BitSet bitmap = null;
|
|
|
|
|
|
- private static final ThreadLocal<BufferedWriter> threadLocal = new ThreadLocal();
|
|
|
-
|
|
|
private CosineSimilarity cosineSimilarity = new CosineSimilarity();
|
|
|
|
|
|
private Double aggThreshold = 0.8;
|
|
|
|
|
|
- public CalRunable(int start, int end, String dataDirPath, Map<Integer, Word> wordCache, Map<String, Set<Integer>> indexCache, BitSet bitmap) {
|
|
|
+ private LinkedBlockingQueue<CalResult> queue;
|
|
|
+
|
|
|
+ private Set<Integer> indexSet = new HashSet<>();
|
|
|
+ private List<String> result = new ArrayList<>();
|
|
|
+
|
|
|
/**
 * Creates a worker for the inclusive index range {@code [start, end]}.
 *
 * <p>All arguments are stored as-is (no defensive copies are made).
 *
 * @param start       first index handled by this worker
 * @param end         last index handled by this worker (inclusive, see the loop in {@code run()})
 * @param dataDirPath data directory path; only stored here
 * @param wordCache   lookup from index to {@code Word}
 * @param indexCache  lookup from stem to the set of word indexes containing that stem
 * @param bitmap      bits marking indexes that have already been processed
 * @param queue       queue onto which {@code run()} puts one {@code CalResult} per index
 */
public CalRunable(int start, int end, String dataDirPath, Map<Integer, Word> wordCache, Map<String, Set<Integer>> indexCache, BitSet bitmap, LinkedBlockingQueue<CalResult> queue) {
    // Output channel.
    this.queue = queue;

    // Lookup structures handed in by the caller.
    this.bitmap = bitmap;
    this.indexCache = indexCache;
    this.wordCache = wordCache;

    // Work range and data location.
    this.dataDirPath = dataDirPath;
    this.end = end;
    this.start = start;
}
|
|
|
|
|
|
@Override
|
|
|
public void run() {
|
|
|
try {
|
|
|
- BufferedWriter bufferedWriter = threadLocal.get();
|
|
|
- if (Objects.isNull(bufferedWriter)) {
|
|
|
- String aggFilePath = String.join(File.separator, dataDirPath, String.format("长尾词_合并_聚合_%s.txt", Thread.currentThread().getId()));
|
|
|
- try {
|
|
|
- FileWriter fileWriter = new FileWriter(new File(aggFilePath));
|
|
|
- bufferedWriter = new BufferedWriter(fileWriter);
|
|
|
- threadLocal.set(bufferedWriter);
|
|
|
- } catch (IOException e) {
|
|
|
- throw new RuntimeException(e);
|
|
|
- }
|
|
|
- }
|
|
|
- Set<Integer> indexSet = new HashSet<>();
|
|
|
- List<String> result = new ArrayList<>();
|
|
|
- try (ProgressBar pb = new ProgressBar(String.format("线程-%s 文本聚合计算", Thread.currentThread().getId()), end-start+1+1)) {
|
|
|
- for (int i = start; i <= end; i++) {
|
|
|
- // 更新发呆
|
|
|
- pb.step();
|
|
|
-
|
|
|
- if (bitmap.get(i)) {
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- Word word = wordCache.get(i);
|
|
|
- if (Objects.isNull(word.getStemMap()) || word.getStemMap().size() == 0) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- bitmap.set(i, true);
|
|
|
- result.add(word.getKey());
|
|
|
-
|
|
|
- for (CharSequence stem : word.getStemMap().keySet()) {
|
|
|
- Set<Integer> positions = indexCache.get(stem);
|
|
|
- for (Integer position : positions) {
|
|
|
- if (bitmap.get(position)) {
|
|
|
- positions.remove(position);
|
|
|
- } else {
|
|
|
- indexSet.add(position);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- for (Integer index : indexSet) {
|
|
|
- Word candicateWord = wordCache.get(index);
|
|
|
- if (Objects.isNull(candicateWord.getStemMap())) {
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- Double v = cosineSimilarity.cosineSimilarity(word.getStemMap(), candicateWord.getStemMap());
|
|
|
- if (v < aggThreshold) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- result.add(candicateWord.getKey());
|
|
|
- }
|
|
|
-
|
|
|
- // 输出计算结果
|
|
|
- if (result.size() == 1) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- for (String s : result) {
|
|
|
- bufferedWriter.write(s);
|
|
|
- bufferedWriter.write("\n");
|
|
|
- }
|
|
|
- bufferedWriter.write("\n");
|
|
|
+ for (int i = start; i <= end; i++) {
|
|
|
+ CalResult calResult = null;
|
|
|
+ if (cal(i)) {
|
|
|
+ calResult = new CalResult(true, new ArrayList<>(result));
|
|
|
+ } else {
|
|
|
+ calResult = new CalResult(false, null);
|
|
|
}
|
|
|
+ calResult.setEndStatus(i == end);
|
|
|
+ queue.put(calResult);
|
|
|
}
|
|
|
} catch (Exception e) {
|
|
|
e.printStackTrace();
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ private boolean cal(int i) {
|
|
|
+ // 判断是否已进行计算
|
|
|
+ if (bitmap.get(i)) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 清除上一轮的数据
|
|
|
+ indexSet.clear();
|
|
|
+ result.clear();
|
|
|
+
|
|
|
+ Word word = wordCache.get(i);
|
|
|
+ if (Objects.isNull(word.getStemMap()) || word.getStemMap().size() == 0) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ bitmap.set(i, true);
|
|
|
+ result.add(word.getKey());
|
|
|
+ for (CharSequence stem : word.getStemMap().keySet()) {
|
|
|
+ Set<Integer> positions = indexCache.get(stem);
|
|
|
+ for (Integer position : positions) {
|
|
|
+ if (bitmap.get(position)) {
|
|
|
+ positions.remove(position);
|
|
|
+ } else {
|
|
|
+ indexSet.add(position);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for (Integer index : indexSet) {
|
|
|
+ Word candicateWord = wordCache.get(index);
|
|
|
+ if (Objects.isNull(candicateWord.getStemMap())) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ Double v = cosineSimilarity.cosineSimilarity(word.getStemMap(), candicateWord.getStemMap());
|
|
|
+ if (v < aggThreshold) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ result.add(candicateWord.getKey());
|
|
|
+ }
|
|
|
+ // 输出计算结果
|
|
|
+ return result.size() > 1;
|
|
|
+ }
|
|
|
}
|