Removed dependency on Lucene. Restricted blocking keys to lowercase.
gpapadis committed Jun 22, 2017
1 parent ff37d72 commit 85216aa
Showing 138 changed files with 218 additions and 449 deletions.
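At the heart of the commit, Lucene's in-memory index (RAMDirectory plus IndexWriter/IndexReader) is replaced in AbstractBlockBuilding by a plain HashMap that maps each blocking key to the ids of the entities producing it, and every key is now trimmed and lowercased before insertion. Below is a minimal, self-contained sketch of that pattern, assuming hypothetical names (InvertedIndexSketch, buildIndex) and a naive token-per-word key derivation in place of JedAI's per-method blocking keys:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch only, not JedAI code: a HashMap-based inverted index from
// normalized blocking key to the ids of the entities carrying it.
public class InvertedIndexSketch {

    static Map<String, List<Integer>> buildIndex(List<String> entityValues) {
        Map<String, List<Integer>> index = new HashMap<>();
        for (int id = 0; id < entityValues.size(); id++) {
            // Every whitespace-delimited token acts as a blocking key here;
            // the real code derives keys via getBlockingKeys(...) per method.
            for (String token : entityValues.get(id).split("\\s+")) {
                String key = token.trim().toLowerCase(); // the new normalization rule
                if (!key.isEmpty()) {
                    List<Integer> entityList = index.get(key);
                    if (entityList == null) {
                        entityList = new ArrayList<>();
                        index.put(key, entityList);
                    }
                    entityList.add(id);
                }
            }
        }
        return index;
    }

    public static void main(String[] args) {
        Map<String, List<Integer>> index =
                buildIndex(Arrays.asList("John Smith", "john SMITH", "Jane Doe"));
        // Lowercasing makes "Smith" and "SMITH" collide on the same key,
        // so entities 0 and 1 end up in the same candidate block.
        System.out.println(index.get("smith")); // prints [0, 1]
    }
}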
jedai-core/pom.xml (14 changes: 0 additions & 14 deletions)
@@ -73,20 +73,6 @@
             <version>3.1.0</version>
         </dependency>

-        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
-        <dependency>
-            <groupId>org.apache.lucene</groupId>
-            <artifactId>lucene-core</artifactId>
-            <version>6.0.1</version>
-        </dependency>
-
-        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers-common -->
-        <dependency>
-            <groupId>org.apache.lucene</groupId>
-            <artifactId>lucene-analyzers-common</artifactId>
-            <version>6.0.1</version>
-        </dependency>
-
        <!-- https://mvnrepository.com/artifact/com.opencsv/opencsv -->
        <dependency>
            <groupId>com.opencsv</groupId>
jedai-core/src/main/java/BlockBuilding/AbstractBlockBuilding.java (278 changes: 69 additions & 209 deletions)
@@ -1,19 +1,18 @@
 /*
  * Copyright [2016] [George Papadakis ([email protected])]
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package BlockBuilding;

 import DataModel.AbstractBlock;
@@ -22,34 +21,15 @@
 import DataModel.EntityProfile;
 import DataModel.UnilateralBlock;
 import Utilities.Converter;
-import java.io.IOException;
+
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
 import java.util.logging.Level;
 import java.util.logging.Logger;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.SimpleAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StoredField;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.BytesRef;

 /**
  *
@@ -63,10 +43,10 @@ public abstract class AbstractBlockBuilding implements IBlockBuilding {
     protected double noOfEntitiesD2;

     protected final List<AbstractBlock> blocks;
-    protected Directory indexDirectoryD1;
-    protected Directory indexDirectoryD2;
     protected List<EntityProfile> entityProfilesD1;
     protected List<EntityProfile> entityProfilesD2;
+    protected Map<String, List<Integer>> invertedIndexD1;
+    protected Map<String, List<Integer>> invertedIndexD2;

     public AbstractBlockBuilding() {
         blocks = new ArrayList<>();
@@ -75,32 +55,10 @@ public AbstractBlockBuilding() {
     }

     protected void buildBlocks() {
-        setMemoryDirectory();
+        indexEntities(invertedIndexD1, entityProfilesD1);

-        IndexWriter iWriter1 = openWriter(indexDirectoryD1);
-        indexEntities(iWriter1, entityProfilesD1);
-        closeWriter(iWriter1);
-
-        if (indexDirectoryD2 != null) {
-            IndexWriter iWriter2 = openWriter(indexDirectoryD2);
-            indexEntities(iWriter2, entityProfilesD2);
-            closeWriter(iWriter2);
-        }
-    }
-
-    protected void closeReader(IndexReader iReader) {
-        try {
-            iReader.close();
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-        }
-    }
-
-    protected void closeWriter(IndexWriter iWriter) {
-        try {
-            iWriter.close();
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
+        if (invertedIndexD2 != null) {
+            indexEntities(invertedIndexD2, entityProfilesD2);
         }
     }

@@ -110,7 +68,7 @@ protected void closeWriter(IndexWriter iWriter) {
     public List<AbstractBlock> getBlocks(List<EntityProfile> profiles) {
         return this.getBlocks(profiles, null);
     }
-
+
     @Override
     public List<AbstractBlock> getBlocks(List<EntityProfile> profilesD1,
             List<EntityProfile> profilesD2) {
@@ -120,9 +78,11 @@ public List<AbstractBlock> getBlocks(List<EntityProfile> profilesD1,
             return null;
         }

+        invertedIndexD1 = new HashMap<>();
         entityProfilesD1 = profilesD1;
         noOfEntitiesD1 = entityProfilesD1.size();
         if (profilesD2 != null) {
+            invertedIndexD2 = new HashMap<>();
             entityProfilesD2 = profilesD2;
             noOfEntitiesD2 = entityProfilesD2.size();
         }
@@ -138,179 +98,79 @@ public double getBruteForceComparisons() {
         return noOfEntitiesD1 * noOfEntitiesD2;
     }

-    protected int[] getDocumentIds(IndexReader reader) {
-        int[] documentIds = new int[reader.numDocs()];
-        for (int i = 0; i < documentIds.length; i++) {
-            try {
-                Document document = reader.document(i);
-                documentIds[i] = Integer.parseInt(document.get(DOC_ID));
-            } catch (IOException ex) {
-                LOGGER.log(Level.SEVERE, null, ex);
-            }
-        }
-        return documentIds;
-    }
-
     public double getTotalNoOfEntities() {
         if (entityProfilesD2 == null) {
             return noOfEntitiesD1;
         }
         return noOfEntitiesD1 + noOfEntitiesD2;
     }

-    protected void indexEntities(IndexWriter index, List<EntityProfile> entities) {
-        try {
-            int counter = 0;
-            for (EntityProfile profile : entities) {
-                Document doc = new Document();
-                doc.add(new StoredField(DOC_ID, counter++));
-                for (Attribute attribute : profile.getAttributes()) {
-                    getBlockingKeys(attribute.getValue()).stream().filter((key) -> (0 < key.trim().length())).forEach((key) -> {
-                        doc.add(new StringField(VALUE_LABEL, key.trim(), Field.Store.YES));
-                    });
-                }
-                index.addDocument(doc);
-            }
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-        }
-    }
+    protected void indexEntities(Map<String, List<Integer>> index, List<EntityProfile> entities) {
+        int counter = 0;
+        for (EntityProfile profile : entities) {
+            for (Attribute attribute : profile.getAttributes()) {
+                Set<String> keys = getBlockingKeys(attribute.getValue());
+                for (String key : keys) {
+                    String normalizedKey = key.trim().toLowerCase();
+                    if (0 < normalizedKey.length()) {
+                        List<Integer> entityList = index.get(normalizedKey);
+                        if (entityList == null) {
+                            entityList = new ArrayList<>();
+                            index.put(normalizedKey, entityList);
+                        }
+                        entityList.add(counter);
+                    }
+                }
+            }
+            counter++;
+        }
+    }

-    public static IndexReader openReader(Directory directory) {
-        try {
-            return DirectoryReader.open(directory);
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-            return null;
-        }
-    }
-
-    protected IndexWriter openWriter(Directory directory) {
-        try {
-            Analyzer analyzer = new SimpleAnalyzer();
-            IndexWriterConfig config = new IndexWriterConfig(analyzer);
-            return new IndexWriter(directory, config);
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-            return null;
-        }
-    }
-
-    protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) {
-        try {
-            int[] documentIds = getDocumentIds(d1Index);
-            final Map<String, int[]> hashedBlocks = new HashMap<>();
-            Fields fields = MultiFields.getFields(d1Index);
-            for (String field : fields) {
-                Terms terms = fields.terms(field);
-                TermsEnum termsEnum = terms.iterator();
-                BytesRef text;
-                while ((text = termsEnum.next()) != null) {
-                    // check whether it is a common term
-                    int d2DocFrequency = d2Index.docFreq(new Term(field, text));
-                    if (d2DocFrequency == 0) {
-                        continue;
-                    }
-
-                    final List<Integer> entityIds = new ArrayList<>();
-                    PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text);
-                    int doc;
-                    while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-                        entityIds.add(documentIds[doc]);
-                    }
-
-                    int[] idsArray = Converter.convertCollectionToArray(entityIds);
-                    hashedBlocks.put(text.utf8ToString(), idsArray);
-                }
-            }
-            return hashedBlocks;
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-            return null;
-        }
-    }
+    protected Map<String, int[]> parseD1Index() {
+        final Map<String, int[]> hashedBlocks = new HashMap<>();
+        for (Entry<String, List<Integer>> entry : invertedIndexD1.entrySet()) {
+            // check whether it is a common term
+            if (!invertedIndexD2.containsKey(entry.getKey())) {
+                continue;
+            }
+
+            int[] idsArray = Converter.convertCollectionToArray(entry.getValue());
+            hashedBlocks.put(entry.getKey(), idsArray);
+        }
+        return hashedBlocks;
+    }

-    protected void parseD2Index(IndexReader d2Index, Map<String, int[]> hashedBlocks) {
-        try {
-            int[] documentIds = getDocumentIds(d2Index);
-            Fields fields = MultiFields.getFields(d2Index);
-            for (String field : fields) {
-                Terms terms = fields.terms(field);
-                TermsEnum termsEnum = terms.iterator();
-                BytesRef text;
-                while ((text = termsEnum.next()) != null) {
-                    if (!hashedBlocks.containsKey(text.utf8ToString())) {
-                        continue;
-                    }
-
-                    final List<Integer> entityIds = new ArrayList<>();
-                    PostingsEnum pe = MultiFields.getTermDocsEnum(d2Index, field, text);
-                    int doc;
-                    while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-                        entityIds.add(documentIds[doc]);
-                    }
-
-                    int[] idsArray = Converter.convertCollectionToArray(entityIds);
-                    int[] d1Entities = hashedBlocks.get(text.utf8ToString());
-                    blocks.add(new BilateralBlock(d1Entities, idsArray));
-                }
-            }
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-        }
-    }
+    protected void parseD2Index(Map<String, int[]> hashedBlocks) {
+        for (Entry<String, List<Integer>> entry : invertedIndexD2.entrySet()) {
+            if (!hashedBlocks.containsKey(entry.getKey())) {
+                continue;
+            }
+
+            int[] idsArray = Converter.convertCollectionToArray(entry.getValue());
+            int[] d1Entities = hashedBlocks.get(entry.getKey());
+            blocks.add(new BilateralBlock(d1Entities, idsArray));
+        }
+    }

-    protected void parseIndex(IndexReader d1Index) {
-        try {
-            int[] documentIds = getDocumentIds(d1Index);
-            Fields fields = MultiFields.getFields(d1Index);
-            for (String field : fields) {
-                Terms terms = fields.terms(field);
-                TermsEnum termsEnum = terms.iterator();
-                BytesRef text;
-                while ((text = termsEnum.next()) != null) {
-                    if (termsEnum.docFreq() < 2) {
-                        continue;
-                    }
-
-                    final List<Integer> entityIds = new ArrayList<>();
-                    PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text);
-                    int doc;
-                    while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-                        entityIds.add(documentIds[doc]);
-                    }
-
-                    int[] idsArray = Converter.convertCollectionToArray(entityIds);
-                    UnilateralBlock block = new UnilateralBlock(idsArray);
-                    blocks.add(block);
-                }
-            }
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-        }
-    }
+    protected void parseIndex() {
+        for (List<Integer> entityList : invertedIndexD1.values()) {
+            if (1 < entityList.size()) {
+                int[] idsArray = Converter.convertCollectionToArray(entityList);
+                UnilateralBlock block = new UnilateralBlock(idsArray);
+                blocks.add(block);
+            }
+        }
+    }

     //read blocks from Lucene index
     public List<AbstractBlock> readBlocks() {
-        IndexReader iReaderD1 = openReader(indexDirectoryD1);
         if (entityProfilesD2 == null) { //Dirty ER
-            parseIndex(iReaderD1);
+            parseIndex();
         } else {
-            IndexReader iReaderD2 = openReader(indexDirectoryD2);
-            Map<String, int[]> hashedBlocks = parseD1Index(iReaderD1, iReaderD2);
-            parseD2Index(iReaderD2, hashedBlocks);
-            closeReader(iReaderD2);
+            Map<String, int[]> hashedBlocks = parseD1Index();
+            parseD2Index(hashedBlocks);
         }
-        closeReader(iReaderD1);

         return blocks;
     }
-
-    protected void setMemoryDirectory() {
-        indexDirectoryD1 = new RAMDirectory();
-        if (entityProfilesD2 != null) {
-            indexDirectoryD2 = new RAMDirectory();
-        }
-    }
 }
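With the Lucene readers gone, readBlocks() above reduces to plain map traversals: under Dirty ER, every posting list with at least two entities becomes a UnilateralBlock; under Clean-Clean ER, only keys present in both inverted indexes yield a BilateralBlock. A sketch of those two traversals, assuming simplified result types (bare id arrays instead of JedAI's block classes) and a hypothetical class name:

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

// Sketch only: block extraction from the Map-based inverted indexes.
public class BlockExtractionSketch {

    // Dirty ER: a key shared by two or more entities yields one block.
    static List<int[]> dirtyErBlocks(Map<String, List<Integer>> index) {
        List<int[]> blocks = new ArrayList<>();
        for (List<Integer> entityList : index.values()) {
            if (1 < entityList.size()) {
                blocks.add(entityList.stream().mapToInt(Integer::intValue).toArray());
            }
        }
        return blocks;
    }

    // Clean-Clean ER: only keys common to both indexes yield a block,
    // pairing the D1 posting list with the D2 posting list of that key.
    static List<int[][]> cleanCleanErBlocks(Map<String, List<Integer>> indexD1,
                                            Map<String, List<Integer>> indexD2) {
        List<int[][]> blocks = new ArrayList<>();
        for (Map.Entry<String, List<Integer>> entry : indexD1.entrySet()) {
            List<Integer> d2Entities = indexD2.get(entry.getKey());
            if (d2Entities == null) {
                continue; // key absent from D2: it suggests no comparisons
            }
            int[] d1Ids = entry.getValue().stream().mapToInt(Integer::intValue).toArray();
            int[] d2Ids = d2Entities.stream().mapToInt(Integer::intValue).toArray();
            blocks.add(new int[][]{d1Ids, d2Ids});
        }
        return blocks;
    }
}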