Skip to content

Commit

Permalink
Extended CLI with support for multiple block building methods.
Browse files: browse the repository at this point in the history
  • Loading branch information
gpapadis committed May 17, 2019
1 parent fb124bb commit e2e4046
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 35 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -35,7 +35,6 @@
* @author G.A.P. II
*/
public enum BlockBuildingMethod {

EXTENDED_Q_GRAMS_BLOCKING,
EXTENDED_SORTED_NEIGHBORHOOD,
EXTENDED_SUFFIX_ARRAYS,
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -19,6 +19,7 @@
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import org.apache.log4j.BasicConfigurator;
Expand Down Expand Up @@ -50,9 +51,9 @@
*
* @author GAP2
*/
public class Main {
public class BlockingBasedWorkflow {

private final static String MAIN_DIR_CCER_DATASETS = System.getProperty("user.dir") + File.separator +"data" + File.separator + "cleanCleanErDatasets" + File.separator;
private final static String MAIN_DIR_CCER_DATASETS = System.getProperty("user.dir") + File.separator + "data" + File.separator + "cleanCleanErDatasets" + File.separator;
private final static String MAIN_DIR_DER_DATASETS = System.getProperty("user.dir") + File.separator + "data" + File.separator + "dirtyErDatasets" + File.separator;
private final static String[] CCER_ENTITY_FILEPATHS = {"abtProfiles", "buyProfiles",
"dblpProfiles", "acmProfiles",
Expand All @@ -72,16 +73,20 @@ public class Main {
private final static String[] CCER_DATASETS = {"Abt-Buy", "DBLP-ACM", "DBLP-Scholar", "Amazon-Google Products", "IMDB-DBPedia Movies"};
private final static String[] BLOCK_BUILDING_METHODS = {"Extended Q-Grams Blocking", "Extended Sorted Neighborhood", "Extended Suffix Arrays Blocking", "LSH Minhash Blocking", "LSH Superbit Blocking", "Q-Grams Blocking", "Sorted Neighborhood", "Standard/Token Blocking", "Suffix Arrays Blocking"};
private final static String[] BLOCK_CLEANING_METHODS = {"Block Filtering", "Comparison-based Block Purging", "Size-based Block Purging"};
private final static String[] COMPARISON_CLEANING_METHODS = {"Canopy Clustering", "Cardinality Edge Pruning", "Cardinality Node Pruning", "Comparison Propagation", "Extended Canopy Clustering", "Reciprocal Cardinality Node Pruning", "Reciprocal Weighed Node Pruning", "Weighed Edge Pruning", "Weighed Node Pruning"};
private final static String[] COMPARISON_CLEANING_METHODS = {"Blast", "Canopy Clustering", "Cardinality Edge Pruning", "Cardinality Node Pruning", "Comparison Propagation", "Extended Canopy Clustering", "Reciprocal Cardinality Node Pruning", "Reciprocal Weighed Node Pruning", "Weighed Edge Pruning", "Weighed Node Pruning"};
private final static String[] ENTITY_MATCHING_METHODS = {"Group Linkage", "Profile Matcher"};
private final static String[] DIRTY_ER_ENTITY_CLUSTERING_METHODS = {"Center Clustering", "Connected Components Clustering", "Cut Clustering", "Markov Clustering", "Merge-Center Clustering", "Ricochet SR Clustering", "Correlation Clustering"};

private static TIntList readMultipleInt(String message, String[] array) {
private static TIntList readMultipleInt(boolean optional, String message, String[] array) {
System.out.println("\n\n" + message);
for (int i = 0; i < array.length; i++) {
System.out.println((i + 1) + " - " + array[i]);
}
System.out.println("This is an optional step. You can select none or all options. Choose -1 to terminate this step!");
if (optional) {
System.out.println("This is an optional step. You can select none or all options. Choose -1 to terminate this step!");
} else {
System.out.println("Please select one or more of the available options. Choose -1 to terminate this step!");
}

final TIntList selectedIds = new TIntArrayList();
while (true) {
Expand Down Expand Up @@ -198,14 +203,14 @@ private static int getDirtyErDataset() {
return readInt(message, DER_DATASETS);
}

private static int getBlockBuildingMethod() {
String message = "Please choose one of the available Block Building methods:";
return readInt(message, BLOCK_BUILDING_METHODS);
private static TIntList getBlockBuildingMethod() {
String message = "Please choose one or more of the available Block Building methods:";
return readMultipleInt(false, message, BLOCK_BUILDING_METHODS);
}

private static TIntList getBlockCleaningMethod() {
String message = "Please choose one, several or none of the available Block Cleaning methods:";
return readMultipleInt(message, BLOCK_CLEANING_METHODS);
return readMultipleInt(true, message, BLOCK_CLEANING_METHODS);
}

private static int getComparisonCleaningMethod() {
Expand Down Expand Up @@ -245,8 +250,7 @@ public static void main(String[] args) {
profilesD2 = eReader2.getEntityProfiles();
System.out.println("Input Entity Profiles D2\t:\t" + profilesD2.size());

final IGroundTruthReader gtReader = new GtSerializationReader(MAIN_DIR_CCER_DATASETS + CCER_GROUNDTRUTH_FILEPATHS[datasetId - 12
]);
final IGroundTruthReader gtReader = new GtSerializationReader(MAIN_DIR_CCER_DATASETS + CCER_GROUNDTRUTH_FILEPATHS[datasetId - 1]);
duplicatePropagation = new BilateralDuplicatePropagation(gtReader.getDuplicatePairs(null));
System.out.println("Existing Duplicates\t:\t" + duplicatePropagation.getDuplicates().size());
} else {
Expand All @@ -266,41 +270,47 @@ public static void main(String[] args) {
final StringBuilder workflowName = new StringBuilder();

// Block Building
int bbMethodId = getBlockBuildingMethod();
double time1 = System.currentTimeMillis();
final TIntList bbMethodIds = getBlockBuildingMethod();
List<AbstractBlock> blocks = new ArrayList<>();

final IBlockBuilding blockBuildingMethod = BlockBuildingMethod.getDefaultConfiguration(BlockBuildingMethod.values()[bbMethodId - 1]);
List<AbstractBlock> blocks = blockBuildingMethod.getBlocks(profilesD1, profilesD2);
double totalTime = 0;
for (TIntIterator bbIterator = bbMethodIds.iterator(); bbIterator.hasNext();) {
double time1 = System.currentTimeMillis();

final IBlockBuilding blockBuildingMethod = BlockBuildingMethod.getDefaultConfiguration(BlockBuildingMethod.values()[bbIterator.next() - 1]);
blocks.addAll(blockBuildingMethod.getBlocks(profilesD1, profilesD2));

double time2 = System.currentTimeMillis();

workflowConf.append(blockBuildingMethod.getMethodConfiguration());
workflowName.append(blockBuildingMethod.getMethodName());
double time2 = System.currentTimeMillis();

totalTime += time2 - time1;
workflowConf.append(blockBuildingMethod.getMethodConfiguration()).append("\n");
workflowName.append(blockBuildingMethod.getMethodName()).append("->");
}

BlocksPerformance blStats = new BlocksPerformance(blocks, duplicatePropagation);
blStats.setStatistics();
blStats.printStatistics(time2 - time1, workflowConf.toString(), workflowName.toString());
blStats.printStatistics(totalTime, workflowConf.toString(), workflowName.toString());

// Block Cleaning
final TIntList bcMethodIds = getBlockCleaningMethod();
if (!bcMethodIds.isEmpty()) {
bcMethodIds.sort();
bcMethodIds.reverse();
final TIntIterator iterator = bcMethodIds.iterator();
while (iterator.hasNext()) {
for (TIntIterator bcIterator = bcMethodIds.iterator(); bcIterator.hasNext();) {
double time3 = System.currentTimeMillis();

final IBlockProcessing blockCleaningMethod = BlockCleaningMethod.getDefaultConfiguration(BlockCleaningMethod.values()[iterator.next() - 1]);
final IBlockProcessing blockCleaningMethod = BlockCleaningMethod.getDefaultConfiguration(BlockCleaningMethod.values()[bcIterator.next() - 1]);
blocks = blockCleaningMethod.refineBlocks(blocks);

double time4 = System.currentTimeMillis();

workflowConf.append("\n").append(blockCleaningMethod.getMethodConfiguration());
workflowName.append("->").append(blockCleaningMethod.getMethodName());
totalTime += time4- time3;
workflowConf.append(blockCleaningMethod.getMethodConfiguration()).append("\n");
workflowName.append(blockCleaningMethod.getMethodName()).append("->");

blStats = new BlocksPerformance(blocks, duplicatePropagation);
blStats.setStatistics();
blStats.printStatistics(time4 - time3, workflowConf.toString(), workflowName.toString());
blStats.printStatistics(totalTime, workflowConf.toString(), workflowName.toString());
}
}

Expand All @@ -314,12 +324,13 @@ public static void main(String[] args) {

double time6 = System.currentTimeMillis();

workflowConf.append("\n").append(comparisonCleaningMethod.getMethodConfiguration());
workflowName.append("->").append(comparisonCleaningMethod.getMethodName());
totalTime += time6 - time5;
workflowConf.append(comparisonCleaningMethod.getMethodConfiguration()).append("\n");
workflowName.append(comparisonCleaningMethod.getMethodName()).append("->");

blStats = new BlocksPerformance(blocks, duplicatePropagation);
blStats.setStatistics();
blStats.printStatistics(time6 - time5, blockBuildingMethod.getMethodConfiguration(), blockBuildingMethod.getMethodName());
blStats.printStatistics(totalTime, workflowConf.toString(), workflowName.toString());
}

// Entity Matching
Expand All @@ -331,8 +342,9 @@ public static void main(String[] args) {

double time8 = System.currentTimeMillis();

workflowConf.append("\n").append(entityMatchingMethod.getMethodConfiguration());
workflowName.append("->").append(entityMatchingMethod.getMethodName());
totalTime += time8- time7;
workflowConf.append(entityMatchingMethod.getMethodConfiguration()).append("\n");
workflowName.append(entityMatchingMethod.getMethodName()).append("->");
System.out.println("Entity Matching overhead time\t:\t" + (time8 - time7));

// Entity Clustering
Expand All @@ -351,11 +363,12 @@ public static void main(String[] args) {

long time10 = System.currentTimeMillis();

workflowConf.append("\n").append(entityClusteringMethod.getMethodConfiguration());
workflowName.append("->").append(entityClusteringMethod.getMethodName());
totalTime += time10 - time9;
workflowConf.append(entityClusteringMethod.getMethodConfiguration());
workflowName.append(entityClusteringMethod.getMethodName());

ClustersPerformance clp = new ClustersPerformance(entityClusters, duplicatePropagation);
clp.setStatistics();
clp.printStatistics(time10 - time9, workflowName.toString(), workflowConf.toString());
clp.printStatistics(totalTime, workflowName.toString(), workflowConf.toString());
}
}

0 comments on commit e2e4046

Please sign in to comment.