From 690e3a42d294b70ee261ec2a2b9d7984a5402e95 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Fri, 24 May 2024 14:56:16 +0200 Subject: [PATCH] feat(post): add alphanumeric postcodes post-processing script --- Document.js | 1 + post/alphanumeric_postcodes.js | 51 +++++++++++++++++++ test/document/post.js | 6 ++- test/post/alphanumeric_postcodes.js | 76 +++++++++++++++++++++++++++++ test/run.js | 7 +-- 5 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 post/alphanumeric_postcodes.js create mode 100644 test/post/alphanumeric_postcodes.js diff --git a/Document.js b/Document.js index e838830..2ae9702 100644 --- a/Document.js +++ b/Document.js @@ -41,6 +41,7 @@ function Document( source, layer, source_id ){ // define default post-processing scripts this.addPostProcessingScript( require('./post/intersections') ); this.addPostProcessingScript( require('./post/seperable_street_names').post ); + this.addPostProcessingScript( require('./post/alphanumeric_postcodes') ); this.addPostProcessingScript( require('./post/deduplication') ); this.addPostProcessingScript( require('./post/language_field_trimming') ); this.addPostProcessingScript( require('./post/popularity') ); diff --git a/post/alphanumeric_postcodes.js b/post/alphanumeric_postcodes.js new file mode 100644 index 0000000..c83aa8c --- /dev/null +++ b/post/alphanumeric_postcodes.js @@ -0,0 +1,51 @@ +const _ = require('lodash'); +const ADDRESS_LAYER_NAME = 'address'; +const ALPHANUMERIC_POSTCODE = /^(\d{4})\s*([^\d]{2})$/; + +/** + * Alphanumeric postcodes post-processing script ensures that both the expanded + * and contracted version of alphanumeric postcodes are indexed. + * + * Without this script a postcode such as '1383GN' would not be matched to the + * query '1383'. + * + * The script is intended to detect these alphanumeric postcodes and index both + * permutations, ie. '1383GN' = ['1383GN', '1383 GN']. + * + * The inverse case should also be covered. ie. '1383 GN' = ['1383 GN', '1383GN']. 
+ *
+ * Note: the regex is restrictive by design. The UK, for instance, uses postcodes
+ * such as 'E81DN' which this method would split incorrectly, so any postcode which
+ * doesn't match is ignored. Future work should consider global postcode formats.
+ *
+ * Note: this script is intended to run *before* the 'deduplication' post processing
+ * script so that the aliases added here don't generate duplicate terms.
+ * @param {Document} doc - document to inspect, a 'zip' alias is added when applicable
+ */
+
+function postcodes( doc ){
+
+  // only apply to docs from the address layer
+  if( doc.getLayer() !== ADDRESS_LAYER_NAME ){ return; }
+
+  // ensure postcode is set
+  const postcode = doc.getAddress('zip');
+  if( !_.isString(postcode) || _.isEmpty(postcode) ){ return; }
+
+  // String#match returns null (not an empty array) when the postcode doesn't match
+  const matches = postcode.match(ALPHANUMERIC_POSTCODE);
+  if( !matches || matches.length !== 3 ){ return; }
+
+  // split the match in to the full text, the numeric part and the alphabetic part
+  const [ match, numeric, alpha ] = matches;
+
+  // detect if the existing postcode is expanded or not
+  const isExpanded = /\s/.test(match);
+  if ( isExpanded ) {
+    doc.setAddressAlias('zip', `${numeric}${alpha}`); // add contracted form as alias
+  } else {
+    doc.setAddressAlias('zip', `${numeric} ${alpha}`); // add expanded form as alias
+  }
+}
+
+module.exports = postcodes;
\ No newline at end of file
diff --git a/test/document/post.js b/test/document/post.js
index 67c53e7..897536f 100644
--- a/test/document/post.js
+++ b/test/document/post.js
@@ -2,10 +2,14 @@ const Document = require('../../Document');
 
 const intersections = require('../../post/intersections');
 const seperable_street_names = require('../../post/seperable_street_names').post;
+const alphanumeric_postcodes = require('../../post/alphanumeric_postcodes');
 const deduplication = require('../../post/deduplication');
 const language_field_trimming = require('../../post/language_field_trimming');
 const popularity = require('../../post/popularity');
-const DEFAULT_SCRIPTS = [intersections, seperable_street_names, deduplication, language_field_trimming, popularity];
+const DEFAULT_SCRIPTS = [
+  intersections, seperable_street_names, alphanumeric_postcodes,
+  deduplication, language_field_trimming, popularity
+];
 
 module.exports.tests = {};
 
diff --git a/test/post/alphanumeric_postcodes.js b/test/post/alphanumeric_postcodes.js
new file mode 100644
index 0000000..ca36c65
--- /dev/null
+++ b/test/post/alphanumeric_postcodes.js
@@ -0,0 +1,76 @@
+const Document = require('../../Document');
+const postcodes = require('../../post/alphanumeric_postcodes');
+
+module.exports.tests = {};
+
+module.exports.tests.alias = function(test) {
+  test('expand', function(t) {
+    const doc = new Document('mysource','address','myid');
+
+    // zip not set
+    postcodes(doc);
+    t.deepEqual(doc.getAddressAliases('zip'), [], 'no alias set');
+
+    // set postcode
+    doc.setAddress('zip', '1383GN');
+
+    // add expanded version
+    postcodes(doc);
+    t.deepEqual(doc.getAddressAliases('zip'), ['1383 GN'], 'alias set');
+
+    t.end();
+  });
+  test('contract', function(t) {
+    const doc = new Document('mysource','address','myid');
+
+    // zip not set
+    postcodes(doc);
+    t.deepEqual(doc.getAddressAliases('zip'), [], 'no alias set');
+
+    // set postcode
+    doc.setAddress('zip', '1383 GN');
+
+    // add contracted version
+    postcodes(doc);
+    t.deepEqual(doc.getAddressAliases('zip'), ['1383GN'], 'alias set');
+
+    t.end();
+  });
+};
+
+module.exports.tests.noop = function(test) {
+  test('noop: invalid layer != "address"', function(t) {
+    const doc = new Document('mysource','not_address','myid');
+
+    // set postcode
+    doc.setAddress('zip', '1383GN');
+
+    // run the script, no alias should be added
+    postcodes(doc);
+    t.deepEqual(doc.getAddressAliases('zip'), [], 'no alias set');
+    t.end();
+  });
+
+  test('noop: postcode doesnt match regex', function(t) {
+    const doc = new Document('mysource','address','myid');
+
+    // set postcode
+    doc.setAddress('zip', 'E81DN');
+
+    // run the script, it must neither throw nor add an alias
+    postcodes(doc);
+    t.deepEqual(doc.getAddressAliases('zip'), [], 'no alias set');
+    t.end();
+  });
+};
+
+module.exports.all = function (tape, common) {
+  // prefix each test name so that output can be traced back to this file
+  const test = (name, testFunction) => {
+    return tape('post/alphanumeric_postcodes: ' + name, testFunction);
+  };
+  // hand the prefixing wrapper to every test group registered above
+  Object.keys(module.exports.tests).forEach((testCase) => {
+    module.exports.tests[testCase](test, common);
+  });
+};
diff --git a/test/run.js b/test/run.js
index 995f2b4..0d0bfd7 100644
--- a/test/run.js
+++ b/test/run.js
@@ -1,7 +1,7 @@
-var tape = require('tape');
-var common = {};
+const tape = require('tape');
+const common = {};
 
-var tests = [
+const tests = [
   require('./Document.js'),
   require('./errors.js'),
   require('./document/centroid.js'),
@@ -23,6 +23,7 @@ var tests = [
   require('./document/toESDocument.js'),
   require('./document/post.js'),
   require('./post/intersections.js'),
+  require('./post/alphanumeric_postcodes.js'),
   require('./post/deduplication.js'),
   require('./post/seperable_street_names.js'),
   require('./post/language_field_trimming.js'),