diff --git a/jedai-core/src/main/java/org/scify/jedai/blockprocessing/AbstractBlockProcessing.java b/jedai-core/src/main/java/org/scify/jedai/blockprocessing/AbstractBlockProcessing.java
index ac44c2fc..fef316d2 100644
--- a/jedai-core/src/main/java/org/scify/jedai/blockprocessing/AbstractBlockProcessing.java
+++ b/jedai-core/src/main/java/org/scify/jedai/blockprocessing/AbstractBlockProcessing.java
@@ -35,7 +35,7 @@ protected void printOriginalStatistics(List<AbstractBlock> inputBlocks) {
             comparisons += block.getNoOfComparisons();
         }
 
-        Log.info("Original blocks\t:\t{0}" + inputBlocks.size());
+        Log.info("Original blocks\t:\t" + inputBlocks.size());
         Log.info("Original comparisons\t:\t" + comparisons);
     }
 }
diff --git a/jedai-core/src/main/java/org/scify/jedai/blockprocessing/blockcleaning/AbstractBlockPurging.java b/jedai-core/src/main/java/org/scify/jedai/blockprocessing/blockcleaning/AbstractBlockPurging.java
index c2adcfb4..60c67cab 100644
--- a/jedai-core/src/main/java/org/scify/jedai/blockprocessing/blockcleaning/AbstractBlockPurging.java
+++ b/jedai-core/src/main/java/org/scify/jedai/blockprocessing/blockcleaning/AbstractBlockPurging.java
@@ -46,7 +46,7 @@ public List<AbstractBlock> refineBlocks(List<AbstractBlock> blocks) {
         final Iterator<AbstractBlock> blocksIterator = blocks.iterator();
         while (blocksIterator.hasNext()) {
             AbstractBlock aBlock = blocksIterator.next();
-            if (satisfiesThreshold(aBlock)) {
+            if (!satisfiesThreshold(aBlock)) {
                 noOfPurgedBlocks++;
                 blocksIterator.remove();
             } else {
diff --git a/jedai-core/src/main/java/org/scify/jedai/utilities/BlocksPerformance.java b/jedai-core/src/main/java/org/scify/jedai/utilities/BlocksPerformance.java
index 90b7d719..9b23b950 100644
--- a/jedai-core/src/main/java/org/scify/jedai/utilities/BlocksPerformance.java
+++ b/jedai-core/src/main/java/org/scify/jedai/utilities/BlocksPerformance.java
@@ -272,7 +272,7 @@ private void setComparisonsCardinality() {
 
     public void setStatistics() {
         if (blocks.isEmpty()) {
-            Log.warn("Empty set of equivalence clusters given as input!");
+            Log.warn("Empty set of blocks was given as input!");
             return;
         }
 
diff --git a/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/BlockBuildingMethod.java b/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/BlockBuildingMethod.java
index 37f026b5..58f76623 100644
--- a/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/BlockBuildingMethod.java
+++ b/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/BlockBuildingMethod.java
@@ -40,8 +40,8 @@
     EXTENDED_SUFFIX_ARRAYS,
     Q_GRAMS_BLOCKING,
     SORTED_NEIGHBORHOOD,
-    SUFFIX_ARRAYS,
-    STANDARD_BLOCKING;
+    STANDARD_BLOCKING,
+    SUFFIX_ARRAYS;
 
     public static IBlockBuilding getDefaultConfiguration(BlockBuildingMethod blbuMethod) {
         switch (blbuMethod) {
diff --git a/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/BlockCleaningMethod.java b/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/BlockCleaningMethod.java
index e453aed0..bd06e164 100644
--- a/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/BlockCleaningMethod.java
+++ b/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/BlockCleaningMethod.java
@@ -1,5 +1,5 @@
 /*
-* Copyright [2016-2017] [George Papadakis (gpapadis@yahoo.gr)]
+* Copyright [2016-2018] [George Papadakis (gpapadis@yahoo.gr)]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
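For reference, the AbstractBlockPurging change above inverts the purging condition: a block is now removed when it does NOT satisfy the threshold. Below is a minimal, standalone sketch of that iterator-removal pattern, with a plain integer limit standing in for JedAI's satisfiesThreshold(AbstractBlock); all names and values here are illustrative and not part of the library.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class PurgingConditionSketch {

    public static void main(String[] args) {
        // The integers stand in for whatever quantity satisfiesThreshold() inspects per block.
        final List<Integer> blockSizes = new ArrayList<>(List.of(3, 250, 7, 9000));
        final int maxComparisons = 100; // illustrative threshold

        int purgedBlocks = 0;
        final Iterator<Integer> iterator = blockSizes.iterator();
        while (iterator.hasNext()) {
            int size = iterator.next();
            if (!(size <= maxComparisons)) { // the block does NOT satisfy the threshold
                purgedBlocks++;
                iterator.remove();           // purge it, mirroring blocksIterator.remove()
            }
        }

        System.out.println("Purged blocks\t:\t" + purgedBlocks);   // 2
        System.out.println("Retained blocks\t:\t" + blockSizes);   // [3, 7]
    }
}

Removing through Iterator.remove(), as the patched method does, avoids the ConcurrentModificationException that deleting elements inside a for-each loop would raise.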
diff --git a/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/ComparisonCleaningMethod.java b/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/ComparisonCleaningMethod.java
index 011f3c05..1562b5ec 100644
--- a/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/ComparisonCleaningMethod.java
+++ b/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/ComparisonCleaningMethod.java
@@ -1,5 +1,5 @@
 /*
-* Copyright [2016-2017] [George Papadakis (gpapadis@yahoo.gr)]
+* Copyright [2016-2018] [George Papadakis (gpapadis@yahoo.gr)]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/EntityClusteringCcerMethod.java b/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/EntityClusteringCcerMethod.java
index fedcfcef..39f16742 100644
--- a/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/EntityClusteringCcerMethod.java
+++ b/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/EntityClusteringCcerMethod.java
@@ -1,5 +1,5 @@
 /*
-* Copyright [2016-2017] [George Papadakis (gpapadis@yahoo.gr)]
+* Copyright [2016-2018] [George Papadakis (gpapadis@yahoo.gr)]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/EntityClusteringDerMethod.java b/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/EntityClusteringDerMethod.java
index b4b3cd97..b3683d86 100644
--- a/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/EntityClusteringDerMethod.java
+++ b/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/EntityClusteringDerMethod.java
@@ -1,5 +1,5 @@
 /*
-* Copyright [2016-2017] [George Papadakis (gpapadis@yahoo.gr)]
+* Copyright [2016-2018] [George Papadakis (gpapadis@yahoo.gr)]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/EntityMatchingMethod.java b/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/EntityMatchingMethod.java
index 0b8b32d5..ca34b18b 100644
--- a/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/EntityMatchingMethod.java
+++ b/jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/EntityMatchingMethod.java
@@ -1,5 +1,5 @@
 /*
-* Copyright [2016-2017] [George Papadakis (gpapadis@yahoo.gr)]
+* Copyright [2016-2018] [George Papadakis (gpapadis@yahoo.gr)]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/jedai-core/src/main/java/org/scify/jedai/workflowbuilder/Main.java b/jedai-core/src/main/java/org/scify/jedai/workflowbuilder/Main.java
new file mode 100644
index 00000000..62a80386
--- /dev/null
+++ b/jedai-core/src/main/java/org/scify/jedai/workflowbuilder/Main.java
@@ -0,0 +1,361 @@
+/*
+* Copyright [2016-2018] [George Papadakis (gpapadis@yahoo.gr)]
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+ */
+package org.scify.jedai.workflowbuilder;
+
+import gnu.trove.iterator.TIntIterator;
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
+import java.io.File;
+import java.util.List;
+import java.util.Scanner;
+import org.apache.log4j.BasicConfigurator;
+import org.scify.jedai.blockbuilding.IBlockBuilding;
+import org.scify.jedai.blockprocessing.IBlockProcessing;
+import org.scify.jedai.datamodel.AbstractBlock;
+import org.scify.jedai.datamodel.EntityProfile;
+import org.scify.jedai.datamodel.EquivalenceCluster;
+import org.scify.jedai.datamodel.SimilarityPairs;
+import org.scify.jedai.datareader.entityreader.EntitySerializationReader;
+import org.scify.jedai.datareader.entityreader.IEntityReader;
+import org.scify.jedai.datareader.groundtruthreader.GtSerializationReader;
+import org.scify.jedai.datareader.groundtruthreader.IGroundTruthReader;
+import org.scify.jedai.entityclustering.IEntityClustering;
+import org.scify.jedai.entitymatching.IEntityMatching;
+import org.scify.jedai.utilities.BlocksPerformance;
+import org.scify.jedai.utilities.ClustersPerformance;
+import org.scify.jedai.utilities.datastructures.AbstractDuplicatePropagation;
+import org.scify.jedai.utilities.datastructures.BilateralDuplicatePropagation;
+import org.scify.jedai.utilities.datastructures.UnilateralDuplicatePropagation;
+import org.scify.jedai.utilities.enumerations.BlockBuildingMethod;
+import org.scify.jedai.utilities.enumerations.BlockCleaningMethod;
+import org.scify.jedai.utilities.enumerations.ComparisonCleaningMethod;
+import org.scify.jedai.utilities.enumerations.EntityClusteringCcerMethod;
+import org.scify.jedai.utilities.enumerations.EntityClusteringDerMethod;
+import org.scify.jedai.utilities.enumerations.EntityMatchingMethod;
+
+/**
+ *
+ * @author GAP2
+ */
+public class Main {
+
+    private final static String MAIN_DIR_CCER_DATASETS = "data" + File.separator + "cleanCleanErDatasets" + File.separator;
+    private final static String MAIN_DIR_DER_DATASETS = "data" + File.separator + "dirtyErDatasets" + File.separator;
+    private final static String[] CCER_ENTITY_FILEPATHS = {"abtProfiles", "buyProfiles",
+        "amazonProfiles", "gpProfiles",
+        "dblpProfiles", "acmProfiles",
+        "dblpProfiles2", "scholarProfiles",
+        "imdbProfiles", "dbpediaProfiles"
+    };
+    private final static String[] CCER_GROUNDTRUTH_FILEPATHS = {"abtBuyIdDuplicates",
+        "amazonGpIdDuplicates",
+        "dblpAcmIdDuplicates",
+        "dblpScholarIdDuplicates",
+        "moviesIdDuplicates"
+    };
+    private final static String[] DER_FILEPATHS = {"abtBuy", "amazonGp", "cddb", "census", "cora", "dblpAcm", "dblpScholar", "movies", "restaurant"};
+    private final static String[] ER_TYPES = {"Clean-clean Entity Resolution", "Dirty Entity Resolution"};
+    private final static String[] CCER_DATASETS = {"Abt-Buy", "Amazon-Google Products", "DBLP-ACM", "DBLP-Scholar", "IMDB-DBPedia Movies"};
+    private final static String[] DER_DATASETS = {"Abt-Buy", "Amazon-Google Products", "CdDb", "Census", "Cora", "DBLP-ACM", "DBLP-Scholar", "Movies", "Restaurant"};
+    private final static String[] BLOCK_BUILDING_METHODS = {"Extended Q-Grams Blocking",
+        "Extended Sorted Neighborhood", "Extended Suffix Arrays Blocking", "Q-Grams Blocking", "Sorted Neighborhood", "Standard/Token Blocking", "Suffix Arrays Blocking"};
+    private final static String[] BLOCK_CLEANING_METHODS = {"Block Filtering", "Comparison-based Block Purging", "Size-based Block Purging"};
+    private final static String[] COMPARISON_CLEANING_METHODS = {"Cardinality Edge Pruning", "Cardinality Node Pruning", "Comparison Propagation", "Reciprocal Cardinality Node Pruning", "Reciprocal Weighted Node Pruning", "Weighted Edge Pruning", "Weighted Node Pruning"};
+    private final static String[] ENTITY_MATCHING_METHODS = {"Group Linkage", "Profile Matcher"};
+    private final static String[] DIRTY_ER_ENTITY_CLUSTERING_METHODS = {"Center Clustering", "Connected Components Clustering", "Cut Clustering", "Markov Clustering", "Merge-Center Clustering", "Ricochet SR Clustering"};
+
+    private static TIntList readMultipleInt(String message, String[] array) {
+        System.out.println("\n\n" + message);
+        for (int i = 0; i < array.length; i++) {
+            System.out.println((i + 1) + " - " + array[i]);
+        }
+        System.out.println("This is an optional step. You can select none or all options. Choose -1 to terminate this step!");
+
+        final TIntList selectedIds = new TIntArrayList();
+        while (true) {
+            int userInt;
+            Scanner keyboard = new Scanner(System.in);
+
+            try {
+                userInt = keyboard.nextInt();
+            } catch (Exception ex) {
+                System.out.println("Invalid input. Please choose between 1 and " + array.length);
+                continue;
+            }
+
+            if (userInt == -1) {
+                break;
+            }
+
+            if (userInt < 1 || userInt > array.length) {
+                System.out.println("Invalid input. Please choose between 1 and " + array.length);
+                continue;
+            }
+
+            if (selectedIds.contains(userInt)) {
+                System.out.println("You have already selected this option!");
+                continue;
+            }
+
+            selectedIds.add(userInt);
+            System.out.println(array[userInt - 1] + " has been selected!");
+        }
+
+        return selectedIds;
+    }
+
+    private static int readInt(String message, String[] array) {
+        System.out.println("\n\n" + message);
+        for (int i = 0; i < array.length; i++) {
+            System.out.println((i + 1) + " - " + array[i]);
+        }
+
+        int userInt;
+        while (true) {
+            Scanner keyboard = new Scanner(System.in);
+
+            try {
+                userInt = keyboard.nextInt();
+            } catch (Exception ex) {
+                System.out.println("Invalid input. Please choose between 1 and " + array.length);
+                continue;
+            }
+
+            if (userInt < 1 || userInt > array.length) {
+                System.out.println("Invalid input. Please choose between 1 and " + array.length);
+                continue;
+            }
+
+            break;
+        }
+
+        System.out.println(array[userInt - 1] + " has been selected!");
+        return userInt;
+    }
+
+    private static int readOptionalInt(String message, String[] array) {
+        System.out.println("\n\n" + message);
+        for (int i = 0; i < array.length; i++) {
+            System.out.println((i + 1) + " - " + array[i]);
+        }
+        System.out.println("This is an optional step. Choose -1 to select nothing!");
+
+        int userInt;
+        while (true) {
+            Scanner keyboard = new Scanner(System.in);
+
+            try {
+                userInt = keyboard.nextInt();
+            } catch (Exception ex) {
+                System.out.println("Invalid input. Please choose between 1 and " + array.length);
+                continue;
+            }
+
+            if (userInt == -1) {
+                System.out.println("No option was selected!");
+                return -1;
+            }
+
+            if (userInt < 1 || userInt > array.length) {
+                System.out.println("Invalid input. Please choose between 1 and " + array.length);
+                continue;
+            }
+
+            break;
+        }
+
+        if (0 <= userInt) {
+            System.out.println(array[userInt - 1] + " has been selected!");
+        }
+
+        return userInt;
+    }
+
+    private static int getErType() {
+        String message = "Please choose the type of Entity Resolution that will be applied:";
+        return readInt(message, ER_TYPES);
+    }
+
+    private static int getCleanCleanErDataset() {
+        String message = "Please choose one of the available Clean-clean ER datasets:";
+        return readInt(message, CCER_DATASETS);
+    }
+
+    private static int getDirtyErDataset() {
+        String message = "Please choose one of the available Dirty ER datasets:";
+        return readInt(message, DER_DATASETS);
+    }
+
+    private static int getBlockBuildingMethod() {
+        String message = "Please choose one of the available Block Building methods:";
+        return readInt(message, BLOCK_BUILDING_METHODS);
+    }
+
+    private static TIntList getBlockCleaningMethod() {
+        String message = "Please choose one, several or none of the available Block Cleaning methods:";
+        return readMultipleInt(message, BLOCK_CLEANING_METHODS);
+    }
+
+    private static int getComparisonCleaningMethod() {
+        String message = "Please choose at most one of the available Comparison Cleaning methods:";
+        return readOptionalInt(message, COMPARISON_CLEANING_METHODS);
+    }
+
+    private static int getEntityMatchingMethod() {
+        String message = "Please choose one of the available Entity Matching methods:";
+        return readInt(message, ENTITY_MATCHING_METHODS);
+    }
+
+    private static int getEntityClusteringMethod() {
+        String message = "Please choose one of the available Entity Clustering methods for Dirty ER:";
+        return readInt(message, DIRTY_ER_ENTITY_CLUSTERING_METHODS);
+    }
+
+    public static void main(String[] args) {
+        BasicConfigurator.configure();
+
+        System.out.println("\n\nWelcome to JedAI-core command line interface.");
+
+        // Entity Resolution type selection
+        int erType = getErType();
+
+        // Data Reading
+        AbstractDuplicatePropagation duplicatePropagation;
+        List<EntityProfile> profilesD1, profilesD2;
+        if (erType == 1) {
+            int datasetId = getCleanCleanErDataset();
+
+            IEntityReader eReader1 = new EntitySerializationReader(MAIN_DIR_CCER_DATASETS + CCER_ENTITY_FILEPATHS[(datasetId - 1) * 2]);
+            profilesD1 = eReader1.getEntityProfiles();
+            System.out.println("Input Entity Profiles D1\t:\t" + profilesD1.size());
+
+            IEntityReader eReader2 = new EntitySerializationReader(MAIN_DIR_CCER_DATASETS + CCER_ENTITY_FILEPATHS[(datasetId - 1) * 2 + 1]);
+            profilesD2 = eReader2.getEntityProfiles();
+            System.out.println("Input Entity Profiles D2\t:\t" + profilesD2.size());
+
+            IGroundTruthReader gtReader = new GtSerializationReader(MAIN_DIR_CCER_DATASETS + CCER_GROUNDTRUTH_FILEPATHS[datasetId - 1]);
+            duplicatePropagation = new BilateralDuplicatePropagation(gtReader.getDuplicatePairs(null));
+            System.out.println("Existing Duplicates\t:\t" + duplicatePropagation.getDuplicates().size());
+        } else {
+            profilesD2 = null;
+            int datasetId = getDirtyErDataset();
+
+            IEntityReader eReader = new EntitySerializationReader(MAIN_DIR_DER_DATASETS + DER_FILEPATHS[datasetId - 1] + "Profiles");
+            profilesD1 = eReader.getEntityProfiles();
+            System.out.println("Input Entity Profiles\t:\t" + profilesD1.size());
+
+            IGroundTruthReader gtReader = new GtSerializationReader(MAIN_DIR_DER_DATASETS + DER_FILEPATHS[datasetId - 1] + "IdDuplicates");
+            duplicatePropagation = new UnilateralDuplicatePropagation(gtReader.getDuplicatePairs(eReader.getEntityProfiles()));
+            System.out.println("Existing Duplicates\t:\t" + duplicatePropagation.getDuplicates().size());
+        }
+
+        StringBuilder workflowConf = new StringBuilder();
+        StringBuilder workflowName = new StringBuilder();
+
+        // Block Building
+        int bbMethodId = getBlockBuildingMethod();
+        long time1 = System.currentTimeMillis();
+
+        IBlockBuilding blockBuildingMethod = BlockBuildingMethod.getDefaultConfiguration(BlockBuildingMethod.values()[bbMethodId - 1]);
+        List<AbstractBlock> blocks = blockBuildingMethod.getBlocks(profilesD1, profilesD2);
+
+        long time2 = System.currentTimeMillis();
+
+        workflowConf.append(blockBuildingMethod.getMethodConfiguration());
+        workflowName.append(blockBuildingMethod.getMethodName());
+
+        BlocksPerformance blStats = new BlocksPerformance(blocks, duplicatePropagation);
+        blStats.setStatistics();
+        blStats.printStatistics(time2 - time1, workflowConf.toString(), workflowName.toString());
+
+        // Block Cleaning
+        final TIntList bcMethodIds = getBlockCleaningMethod();
+        if (!bcMethodIds.isEmpty()) {
+            bcMethodIds.sort();
+            bcMethodIds.reverse();
+            final TIntIterator iterator = bcMethodIds.iterator();
+            while (iterator.hasNext()) {
+                long time3 = System.currentTimeMillis();
+
+                IBlockProcessing blockCleaningMethod = BlockCleaningMethod.getDefaultConfiguration(BlockCleaningMethod.values()[iterator.next() - 1]);
+                blocks = blockCleaningMethod.refineBlocks(blocks);
+
+                long time4 = System.currentTimeMillis();
+
+                workflowConf.append("\n").append(blockCleaningMethod.getMethodConfiguration());
+                workflowName.append("->").append(blockCleaningMethod.getMethodName());
+
+                blStats = new BlocksPerformance(blocks, duplicatePropagation);
+                blStats.setStatistics();
+                blStats.printStatistics(time4 - time3, workflowConf.toString(), workflowName.toString());
+            }
+        }
+
+        // Comparison Cleaning
+        int ccMethodId = getComparisonCleaningMethod();
+        if (0 <= ccMethodId) {
+            long time5 = System.currentTimeMillis();
+
+            IBlockProcessing comparisonCleaningMethod = ComparisonCleaningMethod.getDefaultConfiguration(ComparisonCleaningMethod.values()[ccMethodId - 1]);
+            blocks = comparisonCleaningMethod.refineBlocks(blocks);
+
+            long time6 = System.currentTimeMillis();
+
+            workflowConf.append("\n").append(comparisonCleaningMethod.getMethodConfiguration());
+            workflowName.append("->").append(comparisonCleaningMethod.getMethodName());
+
+            blStats = new BlocksPerformance(blocks, duplicatePropagation);
+            blStats.setStatistics();
+            blStats.printStatistics(time6 - time5, workflowConf.toString(), workflowName.toString());
+        }
+
+        // Entity Matching
+        int emMethodId = getEntityMatchingMethod();
+        long time7 = System.currentTimeMillis();
+
+        IEntityMatching entityMatchingMethod = EntityMatchingMethod.getDefaultConfiguration(EntityMatchingMethod.values()[emMethodId - 1]);
+        SimilarityPairs simPairs = entityMatchingMethod.executeComparisons(blocks, profilesD1, profilesD2);
+
+        long time8 = System.currentTimeMillis();
+
+        workflowConf.append("\n").append(entityMatchingMethod.getMethodConfiguration());
+        workflowName.append("->").append(entityMatchingMethod.getMethodName());
+        System.out.println("Entity Matching overhead time\t:\t" + (time8 - time7));
+
+        // Entity Clustering
+        IEntityClustering entityClusteringMethod;
+        if (erType == 1) { // Clean-Clean ER
+            System.out.println("\n\nUnique Mapping Clustering is the only Entity Clustering method compatible with Clean-Clean ER");
+            entityClusteringMethod = EntityClusteringCcerMethod.getDefaultConfiguration(EntityClusteringCcerMethod.UNIQUE_MAPPING_CLUSTERING);
+        } else { // Dirty ER
+            int ecMethodId = getEntityClusteringMethod();
+            entityClusteringMethod = EntityClusteringDerMethod.getDefaultConfiguration(EntityClusteringDerMethod.values()[ecMethodId - 1]);
+        }
+
+        long time9 = System.currentTimeMillis();
+
+        List<EquivalenceCluster> entityClusters = entityClusteringMethod.getDuplicates(simPairs);
+
+        long time10 = System.currentTimeMillis();
+
+        workflowConf.append("\n").append(entityClusteringMethod.getMethodConfiguration());
+        workflowName.append("->").append(entityClusteringMethod.getMethodName());
+
+        ClustersPerformance clp = new ClustersPerformance(entityClusters, duplicatePropagation);
+        clp.setStatistics();
+        clp.printStatistics(time10 - time9, workflowConf.toString(), workflowName.toString());
+
+    }
+}
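For context, the same end-to-end pipeline that the interactive Main above drives through stdin prompts can also be wired together directly against the enumerations it uses. Below is a minimal, non-interactive sketch of an equivalent Dirty ER run; the class name and dataset paths are placeholders, and every enum constant other than STANDARD_BLOCKING and UNIQUE_MAPPING_CLUSTERING is inferred from the menu labels above rather than taken from the enum sources, so adjust them to the actual definitions.

package org.scify.jedai.workflowbuilder;

import java.util.List;
import org.apache.log4j.BasicConfigurator;
import org.scify.jedai.blockbuilding.IBlockBuilding;
import org.scify.jedai.blockprocessing.IBlockProcessing;
import org.scify.jedai.datamodel.AbstractBlock;
import org.scify.jedai.datamodel.EntityProfile;
import org.scify.jedai.datamodel.EquivalenceCluster;
import org.scify.jedai.datamodel.SimilarityPairs;
import org.scify.jedai.datareader.entityreader.EntitySerializationReader;
import org.scify.jedai.datareader.groundtruthreader.GtSerializationReader;
import org.scify.jedai.entityclustering.IEntityClustering;
import org.scify.jedai.entitymatching.IEntityMatching;
import org.scify.jedai.utilities.ClustersPerformance;
import org.scify.jedai.utilities.datastructures.AbstractDuplicatePropagation;
import org.scify.jedai.utilities.datastructures.UnilateralDuplicatePropagation;
import org.scify.jedai.utilities.enumerations.*;

public class ScriptedDirtyErWorkflow { // hypothetical class, not part of this patch

    public static void main(String[] args) {
        BasicConfigurator.configure();

        // Placeholder paths: point these at any serialized Dirty ER dataset and its ground truth.
        String profilesPath = "data/dirtyErDatasets/coraProfiles";
        String groundTruthPath = "data/dirtyErDatasets/coraIdDuplicates";

        List<EntityProfile> profiles = new EntitySerializationReader(profilesPath).getEntityProfiles();
        AbstractDuplicatePropagation duplicatePropagation = new UnilateralDuplicatePropagation(
                new GtSerializationReader(groundTruthPath).getDuplicatePairs(profiles));

        // Block Building with the default configuration of Standard/Token Blocking.
        IBlockBuilding blockBuilding = BlockBuildingMethod.getDefaultConfiguration(BlockBuildingMethod.STANDARD_BLOCKING);
        List<AbstractBlock> blocks = blockBuilding.getBlocks(profiles, null);

        // Optional Block Cleaning and Comparison Cleaning steps (constant names assumed).
        IBlockProcessing blockCleaning = BlockCleaningMethod.getDefaultConfiguration(BlockCleaningMethod.BLOCK_FILTERING);
        blocks = blockCleaning.refineBlocks(blocks);

        IBlockProcessing comparisonCleaning = ComparisonCleaningMethod.getDefaultConfiguration(ComparisonCleaningMethod.CARDINALITY_NODE_PRUNING);
        blocks = comparisonCleaning.refineBlocks(blocks);

        // Entity Matching followed by Entity Clustering (constant names assumed).
        IEntityMatching entityMatching = EntityMatchingMethod.getDefaultConfiguration(EntityMatchingMethod.PROFILE_MATCHER);
        SimilarityPairs simPairs = entityMatching.executeComparisons(blocks, profiles, null);

        IEntityClustering entityClustering = EntityClusteringDerMethod.getDefaultConfiguration(EntityClusteringDerMethod.CONNECTED_COMPONENTS_CLUSTERING);
        List<EquivalenceCluster> clusters = entityClustering.getDuplicates(simPairs);

        // Evaluate the final clusters; 0 is passed because no overhead time is tracked here.
        ClustersPerformance clp = new ClustersPerformance(clusters, duplicatePropagation);
        clp.setStatistics();
        clp.printStatistics(0, "scripted workflow", "scripted workflow");
    }
}

Alternatively, since the interactive Main reads all of its choices through Scanner on System.in, it can be driven non-interactively by piping the numeric menu answers to standard input.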