Commit: Removed dependency on Lucene. Restricted blocking keys to lowercase.
Showing 138 changed files with 218 additions and 449 deletions.
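In place of Lucene's RAMDirectory-backed index, the commit keeps a plain inverted index per collection: a Map from each blocking key to the list of positions of the entities containing it, with keys now trimmed and lowercased. A minimal standalone sketch of the idea (illustrative only, not the project's API; whitespace tokenization is an assumption made here, since the real keys come from getBlockingKeys() in the subclasses):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Standalone sketch of the map-based inverted index this commit adopts.
// Tokenizing on whitespace is an assumption for illustration only.
public class InvertedIndexSketch {
    public static void main(String[] args) {
        List<String> values = Arrays.asList("John Smith", "john SMITH", "Jane Doe");
        Map<String, List<Integer>> index = new HashMap<>();
        for (int id = 0; id < values.size(); id++) {
            for (String token : values.get(id).split("\\s+")) {
                String key = token.trim().toLowerCase(); // the new normalization
                if (!key.isEmpty()) {
                    index.computeIfAbsent(key, k -> new ArrayList<>()).add(id);
                }
            }
        }
        // {jane=[2], doe=[2], john=[0, 1], smith=[0, 1]} -- "Smith" and "SMITH"
        // now land in the same block; with case-sensitive keys they would not.
        System.out.println(index);
    }
}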
BlockBuilding/AbstractBlockBuilding.java

@@ -1,19 +1,18 @@
 /*
  * Copyright [2016] [George Papadakis ([email protected])]
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package BlockBuilding;

 import DataModel.AbstractBlock;
@@ -22,34 +21,15 @@
 import DataModel.EntityProfile;
 import DataModel.UnilateralBlock;
 import Utilities.Converter;
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
 import java.util.logging.Level;
 import java.util.logging.Logger;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.SimpleAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StoredField;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.BytesRef;

 /**
  *
@@ -63,10 +43,10 @@ public abstract class AbstractBlockBuilding implements IBlockBuilding {
     protected double noOfEntitiesD2;

     protected final List<AbstractBlock> blocks;
-    protected Directory indexDirectoryD1;
-    protected Directory indexDirectoryD2;
     protected List<EntityProfile> entityProfilesD1;
     protected List<EntityProfile> entityProfilesD2;
+    protected Map<String, List<Integer>> invertedIndexD1;
+    protected Map<String, List<Integer>> invertedIndexD2;

     public AbstractBlockBuilding() {
         blocks = new ArrayList<>();
@@ -75,32 +55,10 @@ public AbstractBlockBuilding() {
     }

     protected void buildBlocks() {
-        setMemoryDirectory();
-
-        IndexWriter iWriter1 = openWriter(indexDirectoryD1);
-        indexEntities(iWriter1, entityProfilesD1);
-        closeWriter(iWriter1);
-
-        if (indexDirectoryD2 != null) {
-            IndexWriter iWriter2 = openWriter(indexDirectoryD2);
-            indexEntities(iWriter2, entityProfilesD2);
-            closeWriter(iWriter2);
-        }
-    }
-
-    protected void closeReader(IndexReader iReader) {
-        try {
-            iReader.close();
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-        }
-    }
-
-    protected void closeWriter(IndexWriter iWriter) {
-        try {
-            iWriter.close();
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-        }
-    }
+        indexEntities(invertedIndexD1, entityProfilesD1);
+
+        if (invertedIndexD2 != null) {
+            indexEntities(invertedIndexD2, entityProfilesD2);
+        }
+    }
@@ -110,7 +68,7 @@ protected void closeWriter(IndexWriter iWriter) {
     public List<AbstractBlock> getBlocks(List<EntityProfile> profiles) {
         return this.getBlocks(profiles, null);
     }

     @Override
     public List<AbstractBlock> getBlocks(List<EntityProfile> profilesD1,
             List<EntityProfile> profilesD2) {
@@ -120,9 +78,11 @@ public List<AbstractBlock> getBlocks(List<EntityProfile> profilesD1,
             return null;
         }

+        invertedIndexD1 = new HashMap<>();
         entityProfilesD1 = profilesD1;
         noOfEntitiesD1 = entityProfilesD1.size();
         if (profilesD2 != null) {
+            invertedIndexD2 = new HashMap<>();
             entityProfilesD2 = profilesD2;
             noOfEntitiesD2 = entityProfilesD2.size();
         }
@@ -138,179 +98,79 @@ public double getBruteForceComparisons() {
         return noOfEntitiesD1 * noOfEntitiesD2;
     }

-    protected int[] getDocumentIds(IndexReader reader) {
-        int[] documentIds = new int[reader.numDocs()];
-        for (int i = 0; i < documentIds.length; i++) {
-            try {
-                Document document = reader.document(i);
-                documentIds[i] = Integer.parseInt(document.get(DOC_ID));
-            } catch (IOException ex) {
-                LOGGER.log(Level.SEVERE, null, ex);
-            }
-        }
-        return documentIds;
-    }
-
     public double getTotalNoOfEntities() {
         if (entityProfilesD2 == null) {
             return noOfEntitiesD1;
         }
         return noOfEntitiesD1 + noOfEntitiesD2;
     }

-    protected void indexEntities(IndexWriter index, List<EntityProfile> entities) {
-        try {
-            int counter = 0;
-            for (EntityProfile profile : entities) {
-                Document doc = new Document();
-                doc.add(new StoredField(DOC_ID, counter++));
-                for (Attribute attribute : profile.getAttributes()) {
-                    getBlockingKeys(attribute.getValue()).stream().filter((key) -> (0 < key.trim().length())).forEach((key) -> {
-                        doc.add(new StringField(VALUE_LABEL, key.trim(), Field.Store.YES));
-                    });
-                }
-                index.addDocument(doc);
-            }
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-        }
-    }
+    protected void indexEntities(Map<String, List<Integer>> index, List<EntityProfile> entities) {
+        int counter = 0;
+        for (EntityProfile profile : entities) {
+            for (Attribute attribute : profile.getAttributes()) {
+                Set<String> keys = getBlockingKeys(attribute.getValue());
+                for (String key : keys) {
+                    String normalizedKey = key.trim().toLowerCase();
+                    if (0 < normalizedKey.length()) {
+                        List<Integer> entityList = index.get(normalizedKey);
+                        if (entityList == null) {
+                            entityList = new ArrayList<>();
+                            index.put(normalizedKey, entityList);
+                        }
+                        entityList.add(counter);
+                    }
+                }
+            }
+            counter++;
+        }
+    }
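The rewritten indexEntities fills the map with a get-then-put pattern; on Java 8+, Map.computeIfAbsent is an equivalent, more compact alternative (shown in a comment below, not what the commit ships). One nuance worth flagging: Lucene's postings recorded each document at most once per term, whereas the new list can record the same entity twice if two attributes of one profile yield the same normalized key. A self-contained sketch of that corner case, with hypothetical values:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical corner case: one entity (position 0) with two attribute
// values that normalize to the same blocking key.
public class DuplicateKeySketch {
    public static void main(String[] args) {
        String[] attributeValues = {"Smith", "SMITH"}; // same entity, two attributes
        Map<String, List<Integer>> index = new HashMap<>();
        int counter = 0; // entity position, as in the new indexEntities
        for (String value : attributeValues) {
            String normalizedKey = value.trim().toLowerCase();
            // The committed get-then-put pattern; equivalently:
            // index.computeIfAbsent(normalizedKey, k -> new ArrayList<>()).add(counter);
            List<Integer> entityList = index.get(normalizedKey);
            if (entityList == null) {
                entityList = new ArrayList<>();
                index.put(normalizedKey, entityList);
            }
            entityList.add(counter);
        }
        System.out.println(index); // {smith=[0, 0]} -- position 0 recorded twice
    }
}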

-    public static IndexReader openReader(Directory directory) {
-        try {
-            return DirectoryReader.open(directory);
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-            return null;
-        }
-    }
-
-    protected IndexWriter openWriter(Directory directory) {
-        try {
-            Analyzer analyzer = new SimpleAnalyzer();
-            IndexWriterConfig config = new IndexWriterConfig(analyzer);
-            return new IndexWriter(directory, config);
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-            return null;
-        }
-    }
-
-    protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) {
-        try {
-            int[] documentIds = getDocumentIds(d1Index);
-            final Map<String, int[]> hashedBlocks = new HashMap<>();
-            Fields fields = MultiFields.getFields(d1Index);
-            for (String field : fields) {
-                Terms terms = fields.terms(field);
-                TermsEnum termsEnum = terms.iterator();
-                BytesRef text;
-                while ((text = termsEnum.next()) != null) {
-                    // check whether it is a common term
-                    int d2DocFrequency = d2Index.docFreq(new Term(field, text));
-                    if (d2DocFrequency == 0) {
-                        continue;
-                    }
-
-                    final List<Integer> entityIds = new ArrayList<>();
-                    PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text);
-                    int doc;
-                    while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-                        entityIds.add(documentIds[doc]);
-                    }
-
-                    int[] idsArray = Converter.convertCollectionToArray(entityIds);
-                    hashedBlocks.put(text.utf8ToString(), idsArray);
-                }
-            }
-            return hashedBlocks;
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-            return null;
-        }
-    }
+    protected Map<String, int[]> parseD1Index() {
+        final Map<String, int[]> hashedBlocks = new HashMap<>();
+        for (Entry<String, List<Integer>> entry : invertedIndexD1.entrySet()) {
+            // check whether it is a common term
+            if (!invertedIndexD2.containsKey(entry.getKey())) {
+                continue;
+            }
+
+            int[] idsArray = Converter.convertCollectionToArray(entry.getValue());
+            hashedBlocks.put(entry.getKey(), idsArray);
+        }
+        return hashedBlocks;
+    }

-    protected void parseD2Index(IndexReader d2Index, Map<String, int[]> hashedBlocks) {
-        try {
-            int[] documentIds = getDocumentIds(d2Index);
-            Fields fields = MultiFields.getFields(d2Index);
-            for (String field : fields) {
-                Terms terms = fields.terms(field);
-                TermsEnum termsEnum = terms.iterator();
-                BytesRef text;
-                while ((text = termsEnum.next()) != null) {
-                    if (!hashedBlocks.containsKey(text.utf8ToString())) {
-                        continue;
-                    }
-
-                    final List<Integer> entityIds = new ArrayList<>();
-                    PostingsEnum pe = MultiFields.getTermDocsEnum(d2Index, field, text);
-                    int doc;
-                    while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-                        entityIds.add(documentIds[doc]);
-                    }
-
-                    int[] idsArray = Converter.convertCollectionToArray(entityIds);
-                    int[] d1Entities = hashedBlocks.get(text.utf8ToString());
-                    blocks.add(new BilateralBlock(d1Entities, idsArray));
-                }
-            }
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-        }
-    }
+    protected void parseD2Index(Map<String, int[]> hashedBlocks) {
+        for (Entry<String, List<Integer>> entry : invertedIndexD2.entrySet()) {
+            if (!hashedBlocks.containsKey(entry.getKey())) {
+                continue;
+            }
+
+            int[] idsArray = Converter.convertCollectionToArray(entry.getValue());
+            int[] d1Entities = hashedBlocks.get(entry.getKey());
+            blocks.add(new BilateralBlock(d1Entities, idsArray));
+        }
+    }
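Taken together, parseD1Index and parseD2Index intersect the two inverted indexes: only keys occurring in both collections survive, and each surviving key pairs its D1 entity ids with its D2 entity ids in a BilateralBlock. A self-contained sketch with hypothetical data, where println stands in for blocks.add(new BilateralBlock(...)):

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical data: keys common to both indexes become bilateral blocks
// pairing D1 entity ids with D2 entity ids.
public class BilateralSketch {
    public static void main(String[] args) {
        Map<String, List<Integer>> d1 = new HashMap<>();
        d1.put("smith", Arrays.asList(0, 3));
        d1.put("jones", Arrays.asList(1));
        Map<String, List<Integer>> d2 = new HashMap<>();
        d2.put("smith", Arrays.asList(7));
        d2.put("brown", Arrays.asList(2, 5));

        for (Map.Entry<String, List<Integer>> entry : d1.entrySet()) {
            if (!d2.containsKey(entry.getKey())) {
                continue; // "jones" is dropped: not a common key
            }
            System.out.println(entry.getKey() + ": D1=" + entry.getValue()
                    + " x D2=" + d2.get(entry.getKey()));
        }
        // Output: smith: D1=[0, 3] x D2=[7]
    }
}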

-    protected void parseIndex(IndexReader d1Index) {
-        try {
-            int[] documentIds = getDocumentIds(d1Index);
-            Fields fields = MultiFields.getFields(d1Index);
-            for (String field : fields) {
-                Terms terms = fields.terms(field);
-                TermsEnum termsEnum = terms.iterator();
-                BytesRef text;
-                while ((text = termsEnum.next()) != null) {
-                    if (termsEnum.docFreq() < 2) {
-                        continue;
-                    }
-
-                    final List<Integer> entityIds = new ArrayList<>();
-                    PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text);
-                    int doc;
-                    while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-                        entityIds.add(documentIds[doc]);
-                    }
-
-                    int[] idsArray = Converter.convertCollectionToArray(entityIds);
-                    UnilateralBlock block = new UnilateralBlock(idsArray);
-                    blocks.add(block);
-                }
-            }
-        } catch (IOException ex) {
-            LOGGER.log(Level.SEVERE, null, ex);
-        }
-    }
+    protected void parseIndex() {
+        for (List<Integer> entityList : invertedIndexD1.values()) {
+            if (1 < entityList.size()) {
+                int[] idsArray = Converter.convertCollectionToArray(entityList);
+                UnilateralBlock block = new UnilateralBlock(idsArray);
+                blocks.add(block);
+            }
+        }
+    }
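In the Dirty ER path above, keys with a single entity are skipped because a block of size n produces n(n-1)/2 pairwise comparisons, so singletons contribute nothing; this mirrors the old docFreq() < 2 test. A quick sketch of that count (the formula is standard; the helper below is not from the repository):

// Pairwise comparisons contributed by one unilateral block:
// every unordered pair of distinct entities in it is compared once.
public class BlockComparisons {
    static long comparisons(int blockSize) {
        return (long) blockSize * (blockSize - 1) / 2;
    }

    public static void main(String[] args) {
        System.out.println(comparisons(1)); // 0 -- why singleton lists are skipped
        System.out.println(comparisons(2)); // 1
        System.out.println(comparisons(5)); // 10
    }
}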

-    //read blocks from Lucene index
     public List<AbstractBlock> readBlocks() {
-        IndexReader iReaderD1 = openReader(indexDirectoryD1);
         if (entityProfilesD2 == null) { //Dirty ER
-            parseIndex(iReaderD1);
+            parseIndex();
         } else {
-            IndexReader iReaderD2 = openReader(indexDirectoryD2);
-            Map<String, int[]> hashedBlocks = parseD1Index(iReaderD1, iReaderD2);
-            parseD2Index(iReaderD2, hashedBlocks);
-            closeReader(iReaderD2);
+            Map<String, int[]> hashedBlocks = parseD1Index();
+            parseD2Index(hashedBlocks);
         }
-        closeReader(iReaderD1);
-
-        return blocks;
-    }
-
-    protected void setMemoryDirectory() {
-        indexDirectoryD1 = new RAMDirectory();
-        if (entityProfilesD2 != null) {
-            indexDirectoryD2 = new RAMDirectory();
-        }
+        return blocks;
     }
 }
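For callers the public flow is unchanged: getBlocks wires up the in-memory maps, and there is no longer any reader/writer lifecycle to manage. A hedged usage sketch against this repository's API; StandardBlocking is an assumption about the concrete subclass name, and loading the profiles is left abstract:

import BlockBuilding.IBlockBuilding;
import BlockBuilding.StandardBlocking;
import DataModel.AbstractBlock;
import DataModel.EntityProfile;
import java.util.List;

// Usage sketch. Substitute whichever AbstractBlockBuilding subclass
// (and data source) you actually use; StandardBlocking is assumed here.
public class BlockingExample {
    public static List<AbstractBlock> run(List<EntityProfile> profilesD1,
                                          List<EntityProfile> profilesD2) {
        IBlockBuilding blockBuilder = new StandardBlocking();
        // Pass one collection for Dirty ER, or two for Clean-Clean ER.
        return blockBuilder.getBlocks(profilesD1, profilesD2);
    }
}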