From 91b4c501930bb6c589d24dd84eda1db747b29a46 Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Sat, 19 May 2018 22:59:54 -0400 Subject: [PATCH] feat(libpostal): Use libpostal service BREAKING CHANGE Use microservice-wrapper to avoid having to load libpostal locally. Note: this now requires a new configuration section in `pelias.json`, a top-level `services` key with the usual properties. Here's an example full `pelias.json`: ``` { "api": { "textAnalyzer": "libpostal" }, "services": { "libpostal": { "url": "http://libpostal-service-url:8080", "timeout": 4000 } } } ``` Fixes https://github.com/pelias/interpolation/issues/106 --- .jshintrc | 2 +- api/search.js | 199 +++++++++++++++++++------------------ cmd/server.js | 2 +- lib/analyze.js | 50 ++++------ lib/libpostal_wrapper.js | 22 ++++ libpostal/service.js | 49 +++++++++ readme.md | 3 - stream/address/lookup.js | 9 +- stream/street/augment.js | 67 +++++++------ test/lib/analyze.js | 21 ++-- test/lib/mock_libpostal.js | 24 +++-- 11 files changed, 261 insertions(+), 187 deletions(-) create mode 100644 lib/libpostal_wrapper.js create mode 100644 libpostal/service.js diff --git a/.jshintrc b/.jshintrc index 159fb604..aa3f1574 100644 --- a/.jshintrc +++ b/.jshintrc @@ -2,7 +2,7 @@ "node": true, "curly": true, "eqeqeq": true, - "esversion": 6, + "esversion": 8, "freeze": true, "immed": true, "indent": 2, diff --git a/api/search.js b/api/search.js index e7a86bda..055368a3 100644 --- a/api/search.js +++ b/api/search.js @@ -31,120 +31,123 @@ function setup( addressDbPath, streetDbPath ){ if( 'string' !== typeof number ){ return cb( 'invalid number' ); } if( 'string' !== typeof street ){ return cb( 'invalid street' ); } - var normalized = { - number: analyze.housenumber( number ), - street: analyze.street( street ) - }; + analyze.street(street, function streetAnalyzeCallback(err, street, metadata) { - // error checking - if( isNaN( point.lat ) ){ return cb( 'invalid latitude' ); } - if( isNaN( point.lon ) ){ return cb( 'invalid longitude' ); } - if( isNaN( normalized.number ) ){ return cb( 'invalid number' ); } - if( !normalized.street.length ){ return cb( 'invalid street' ); } + var normalized = { + number: analyze.housenumber( number ), + street: street + }; - // perform a db lookup for the specified street - // @todo: perofmance: only query for part of the table - query.search( db, point, normalized.number, normalized.street, function( err, res ){ + // error checking + if( isNaN( point.lat ) ){ return cb( 'invalid latitude' ); } + if( isNaN( point.lon ) ){ return cb( 'invalid longitude' ); } + if( isNaN( normalized.number ) ){ return cb( 'invalid number' ); } + if( !normalized.street.length ){ return cb( 'invalid street' ); } - // @note: results can be from multiple different street ids. + // perform a db lookup for the specified street + // @todo: perofmance: only query for part of the table + query.search( db, point, normalized.number, normalized.street, function( err, res ){ - // an error occurred or no results were found - if( err || !res || !res.length ){ return cb( err, null ); } + // @note: results can be from multiple different street ids. - // try to find an exact match - var match = res.find( function( row ){ - if( row.source === 'VERTEX' ){ return false; } - return row.housenumber === normalized.number; - }); + // an error occurred or no results were found + if( err || !res || !res.length ){ return cb( err, null ); } - // return exact match - if( match ){ - return cb( null, { - type: 'exact', - source: match.source, - source_id: match.source_id, - number: analyze.housenumberFloatToString( match.housenumber ), - lat: parseFloat( match.lat.toFixed(7) ), - lon: parseFloat( match.lon.toFixed(7) ) + // try to find an exact match + var match = res.find( function( row ){ + if( row.source === 'VERTEX' ){ return false; } + return row.housenumber === normalized.number; }); - } - // try to find a close match with the same number (possibly an apartment) - match = res.find( function( row ){ - if( row.source === 'VERTEX' ){ return false; } - return Math.floor( row.housenumber ) === Math.floor( normalized.number ); - }); + // return exact match + if( match ){ + return cb( null, { + type: 'exact', + source: match.source, + source_id: match.source_id, + number: analyze.housenumberFloatToString( match.housenumber ), + lat: parseFloat( match.lat.toFixed(7) ), + lon: parseFloat( match.lon.toFixed(7) ) + }); + } - // return close match - if( match ){ - return cb( null, { - type: 'close', - source: match.source, - source_id: match.source_id, - number: analyze.housenumberFloatToString( match.housenumber ), - lat: parseFloat( match.lat.toFixed(7) ), - lon: parseFloat( match.lon.toFixed(7) ) + // try to find a close match with the same number (possibly an apartment) + match = res.find( function( row ){ + if( row.source === 'VERTEX' ){ return false; } + return Math.floor( row.housenumber ) === Math.floor( normalized.number ); }); - } - - // attempt to interpolate the position - - // find the records before and after the desired number (group by street segment) - var map = {}; - res.forEach( function( row ){ - if( !map.hasOwnProperty( row.id ) ){ map[row.id] = {}; } - if( row.housenumber < normalized.number ){ map[row.id].before = row; } - if( row.housenumber > normalized.number ){ map[row.id].after = row; } - if( map[row.id].before && map[row.id].after ){ - map[row.id].diff = { - before: map[row.id].before.housenumber - normalized.number, - after: map[row.id].after.housenumber - normalized.number - }; + + // return close match + if( match ){ + return cb( null, { + type: 'close', + source: match.source, + source_id: match.source_id, + number: analyze.housenumberFloatToString( match.housenumber ), + lat: parseFloat( match.lat.toFixed(7) ), + lon: parseFloat( match.lon.toFixed(7) ) + }); } - }); - // remove segments with less than 2 points; convert map to array - var segments = []; - for( var id in map ){ - if( map[id].before && map[id].after ){ - segments.push( map[id] ); + // attempt to interpolate the position + + // find the records before and after the desired number (group by street segment) + var map = {}; + res.forEach( function( row ){ + if( !map.hasOwnProperty( row.id ) ){ map[row.id] = {}; } + if( row.housenumber < normalized.number ){ map[row.id].before = row; } + if( row.housenumber > normalized.number ){ map[row.id].after = row; } + if( map[row.id].before && map[row.id].after ){ + map[row.id].diff = { + before: map[row.id].before.housenumber - normalized.number, + after: map[row.id].after.housenumber - normalized.number + }; + } + }); + + // remove segments with less than 2 points; convert map to array + var segments = []; + for( var id in map ){ + if( map[id].before && map[id].after ){ + segments.push( map[id] ); + } } - } - // could not find two rows to use for interpolation - if( !segments.length ){ - return cb( null, null ); - } + // could not find two rows to use for interpolation + if( !segments.length ){ + return cb( null, null ); + } - // sort by miniumum housenumber difference from target housenumber ASC - segments.sort( function( a, b ){ - return Math.abs( a.diff.before + a.diff.after ) - Math.abs( b.diff.before + b.diff.after ); - }); + // sort by miniumum housenumber difference from target housenumber ASC + segments.sort( function( a, b ){ + return Math.abs( a.diff.before + a.diff.after ) - Math.abs( b.diff.before + b.diff.after ); + }); - // select before/after values to use for the interpolation - var before = segments[0].before; - var after = segments[0].after; - - // compute interpolated address - var A = { lat: project.toRad( before.proj_lat ), lon: project.toRad( before.proj_lon ) }; - var B = { lat: project.toRad( after.proj_lat ), lon: project.toRad( after.proj_lon ) }; - var distance = geodesic.distance( A, B ); - - // if distance = 0 then we can simply use either A or B (they are the same lat/lon) - // else we interpolate between the two positions - var point = A; - if( distance > 0 ){ - var ratio = ((normalized.number - before.housenumber) / (after.housenumber - before.housenumber)); - point = geodesic.interpolate( distance, ratio, A, B ); - } - - // return interpolated address - return cb( null, { - type: 'interpolated', - source: 'mixed', - number: '' + Math.floor( normalized.number ), - lat: parseFloat( project.toDeg( point.lat ).toFixed(7) ), - lon: parseFloat( project.toDeg( point.lon ).toFixed(7) ) + // select before/after values to use for the interpolation + var before = segments[0].before; + var after = segments[0].after; + + // compute interpolated address + var A = { lat: project.toRad( before.proj_lat ), lon: project.toRad( before.proj_lon ) }; + var B = { lat: project.toRad( after.proj_lat ), lon: project.toRad( after.proj_lon ) }; + var distance = geodesic.distance( A, B ); + + // if distance = 0 then we can simply use either A or B (they are the same lat/lon) + // else we interpolate between the two positions + var point = A; + if( distance > 0 ){ + var ratio = ((normalized.number - before.housenumber) / (after.housenumber - before.housenumber)); + point = geodesic.interpolate( distance, ratio, A, B ); + } + + // return interpolated address + return cb( null, { + type: 'interpolated', + source: 'mixed', + number: '' + Math.floor( normalized.number ), + lat: parseFloat( project.toDeg( point.lat ).toFixed(7) ), + lon: parseFloat( project.toDeg( point.lon ).toFixed(7) ) + }); }); }); }; diff --git a/cmd/server.js b/cmd/server.js index 9fd6c1b9..61d63bfd 100644 --- a/cmd/server.js +++ b/cmd/server.js @@ -211,7 +211,7 @@ app.use('/demo', express.static('demo')); app.listen( PORT, function() { // force loading of libpostal - analyze.street( 'test street' ); + //analyze.street( 'test street', function() {} ); console.log( 'server listening on port', PORT ); }); diff --git a/lib/analyze.js b/lib/analyze.js index cc13cf99..4e95b34b 100644 --- a/lib/analyze.js +++ b/lib/analyze.js @@ -1,3 +1,4 @@ +const libpostal_service = require( './libpostal_wrapper' ); // constants for controlling how we parse ranges, eg: 'α-β' // some ranges such as '1-7' are ambiguous; it could mean 'apt 7, no 1'; or // it could mean 'apt 1, no 7'; or could even be a valid range 'one to seven'. @@ -7,47 +8,30 @@ var MIN_RANGE = 1; // the miniumum amount β is higher than α var MAX_RANGE = 6; // the maximum amount β is higher than α var MIN_RANGE_HOUSENUMBER = 10; // the minimum acceptible value for both α and β -/* - * Return the appropriate version of node-postal - */ - -var _nodepostal_module; -function get_libpostal() { - // lazy load this dependency; since it's large (~2GB RAM) and may be - // accidentally required by a process which doesn't use it. - if (!_nodepostal_module) { - // load the mock library if MOCK_LIBPOSTAL env var is set - if (process.env.MOCK_LIBPOSTAL) { - _nodepostal_module = require('../test/lib/mock_libpostal'); - // otherwise load the real thing - } else { - _nodepostal_module = require('node-postal'); - } - } - - return _nodepostal_module; -} - /** analyze input streetname string and return a list of expansions. **/ -function street( streetName ){ - const postal = get_libpostal(); +function street( streetName, callback ){ + const postal = libpostal_service(); // use libpostal to expand the address - var expansions = postal.expand.expand_address( streetName ); + postal.expand.expand_address( streetName, function streetCallback(err, results, metadata) { + if (err) { + return callback(err); + } - // remove ordinals - expansions = expansions.map(function( item ){ - return item.replace( /(([0-9]+)(st|nd|rd|th)($|\s))/gi, '$2 ' ).trim(); - }); + // remove ordinals + let expansions = results.map(function( item ){ + return item.replace( /(([0-9]+)(st|nd|rd|th)($|\s))/gi, '$2 ' ).trim(); + }); - // remove duplicates - expansions = expansions.filter(function(item, pos, self) { - return self.indexOf(item) === pos; - }); + // remove duplicates + expansions = expansions.filter(function(item, pos, self) { + return self.indexOf(item) === pos; + }); - return expansions; + callback(null, expansions, metadata); + }); } /** diff --git a/lib/libpostal_wrapper.js b/lib/libpostal_wrapper.js new file mode 100644 index 00000000..fe6048a1 --- /dev/null +++ b/lib/libpostal_wrapper.js @@ -0,0 +1,22 @@ +const mock_libpostal = require('../test/lib/mock_libpostal'); + +// This module is a wrapper around the actual libpostal service library +// and the mock libpostal library +// it allows an environment variable to switch which library is used in application code + +let libpostal_module; +function get_libpostal() { + // return the mock library if MOCK_LIBPOSTAL env var is set + if (process.env.MOCK_LIBPOSTAL) { + return mock_libpostal; + // otherwise return the actual service + } else { + // lazy load the libpostal module so that tests can skip configuring the service + if (!libpostal_module) { + libpostal_module = require( '../libpostal/service' ); + } + return libpostal_module; + } +} + +module.exports = get_libpostal; diff --git a/libpostal/service.js b/libpostal/service.js new file mode 100644 index 00000000..741a0d3c --- /dev/null +++ b/libpostal/service.js @@ -0,0 +1,49 @@ +// deasync is used to proved a sync-looking interface +// to the async call to the libpostal service +const microservice_wrapper = require('pelias-microservice-wrapper'); +const pelias_config = require('pelias-config').generate(); + +const LibpostalServiceConfig = class extends microservice_wrapper.ServiceConfiguration { + constructor(configBlob) { + super('libpostal', configBlob); + } + getUrl(params) { + return this.baseUrl + params.endpoint; + } + getParameters(params) { + return { + address: params.address + }; + } +}; + +// use the 'services.libpostal' config entry if available, otherwise fall back to 'api.services.libpostal' +const config_entry = pelias_config.get('services.libpostal') || pelias_config.get('api.services.libpostal'); + +if (!config_entry) { + throw new Error('Libpostal configuration not found in `services.libpostal` or `api.services.libpostal`'); +} + +// create an instance of the libpostal service +const libpostal_service = microservice_wrapper.service( + new LibpostalServiceConfig(config_entry) +); + +// create an object that looks like the interface to `node-postal` but uses a remote service +module.exports = { + expand: { + expand_address: function(param, callback) { + const params = { + endpoint: 'expand', + address: param + }; + + // the libpostal service will not handle an empty parameter + // so return empty array immediately + if (!param) { + return callback(null, []); + } + libpostal_service(params, callback); + } + } +}; diff --git a/readme.md b/readme.md index 1884c712..e6e444bf 100644 --- a/readme.md +++ b/readme.md @@ -238,7 +238,6 @@ see: [source](https://github.com/pelias/interpolation/blob/master/cmd/server.js) # docker ### build docker image -This can take some time for the first build due to installing libpostal from source. ```bash docker build -t pelias/interpolation . ``` @@ -438,8 +437,6 @@ To use Interpolation service with the Pelias API, [configure the pelias config f ### install dependencies -*note:* [libpostal](https://github.com/openvenues/node-postal#troubleshooting) **must** be installed on your system before you continue! - The `Dockerfile` in this repo has complete instructions on how to install everything from scratch on Ubuntu. ### TIGER dependency on GDAL diff --git a/stream/address/lookup.js b/stream/address/lookup.js index e764d96f..ecd45906 100644 --- a/stream/address/lookup.js +++ b/stream/address/lookup.js @@ -1,5 +1,5 @@ - var fs = require('fs'), + util = require('util'), through = require('through2'), query = { lookup: require('../../query/lookup') }, project = require('../../lib/project'), @@ -15,9 +15,11 @@ if( hasFD3 ){ process.conferr.on( 'error', function(){ process.conferr = { write: function noop(){} }; }); } +const analyze_street = util.promisify(analyze.street); + function streamFactory(db){ - return through.obj(function( batch, _, next ){ + return through.obj(async function( batch, _, next ){ // invalid batch if( !batch || !batch.length ){ @@ -30,7 +32,8 @@ function streamFactory(db){ // all street names in batch should be the same // perform libpostal normalization - var names = analyze.street( result.getStreet() ); + + var names = await analyze_street( result.getStreet() ); // ensure at least one name was produced if( !names.length ){ diff --git a/stream/street/augment.js b/stream/street/augment.js index 1032bcde..d27613a1 100644 --- a/stream/street/augment.js +++ b/stream/street/augment.js @@ -1,12 +1,15 @@ - -var through = require('through2'), - analyze = require('../../lib/analyze'); +const util = require('util'); +const _ = require('lodash'); +const through = require('through2'); +const analyze = require('../../lib/analyze'); // increase/decrease bbox bounds by this much in order to find houses which // might be slighly outside the bounds. // eg: http://geojson.io/#id=gist:anonymous/ce8b0cdd2ba83ef24cfaab49d36d8cdd&map=15/52.5011/13.3222 var FUDGE_FACTOR = 0.005; +const analyze_street = util.promisify(analyze.street); + /** this stream augments the parsed data with additional fields. @@ -14,37 +17,41 @@ var FUDGE_FACTOR = 0.005; - perform libpostal normalization - apply 'fudge factor' to bbox **/ +let i = 0; function streamFactory(){ - return through.obj(function( street, _, next ){ + return through.obj(async function( street, enc, next ){ // normalize all names - var names = []; - street.getNames().forEach( function( name ){ - names = names.concat( analyze.street( name ) ); - }); - - // if the source file contains no valid names for this polyline - if( !names.length ){ - console.error( 'street has no valid names, check your 0sv file:' ); - console.error( street.getEncodedPolyline() ); - return next(); - } - - street.setNames( names ); - - // expand bbox - var bbox = street.getBbox(); - street.setBbox({ - minX: bbox.minX -FUDGE_FACTOR, - minY: bbox.minY -FUDGE_FACTOR, - maxX: bbox.maxX +FUDGE_FACTOR, - maxY: bbox.maxY +FUDGE_FACTOR + await Promise.all(street.getNames().map( async (name) => { + return await analyze_street( name ); + })).then(function(names) { + // if the source file contains no valid names for this polyline + if( !names.length ){ + console.error( 'street has no valid names, check your 0sv file:' ); + console.error( street.getEncodedPolyline() ); + return next(); + } + + try { + // an extra level of arrays is added by Promise.all + names = _.flatten(names); + street.setNames( names ); + } catch (e) { + console.error(e); + console.error(`trying to set invalid name ${names}`); + } + + // expand bbox + var bbox = street.getBbox(); + street.setBbox({ + minX: bbox.minX -FUDGE_FACTOR, + minY: bbox.minY -FUDGE_FACTOR, + maxX: bbox.maxX +FUDGE_FACTOR, + maxY: bbox.maxY +FUDGE_FACTOR + }); + + next(null, street); }); - - // push augmented data downstream - this.push( street ); - - next(); }); } diff --git a/test/lib/analyze.js b/test/lib/analyze.js index 9ae15f6f..3364429b 100644 --- a/test/lib/analyze.js +++ b/test/lib/analyze.js @@ -5,19 +5,22 @@ module.exports.analyze = {}; module.exports.analyze.street = function(test) { test('street: synonym expansions', function(t) { - var perms = analyze.street('grolmanstraße'); - t.deepEqual(perms, ['grolmanstraße', 'grolman straße']); - t.end(); + analyze.street('grolmanstraße', function(err, perms) { + t.deepEqual(perms, ['grolmanstraße', 'grolman straße']); + t.end(); + }); }); test('street: remove ordinals', function(t) { - var perms = analyze.street('West 26th st'); - t.deepEqual(perms, ['west 26 street', 'west 26 saint']); - t.end(); + analyze.street('West 26th st', function(err, perms) { + t.deepEqual(perms, ['west 26 street', 'west 26 saint']); + t.end(); + }); }); test('street: always returns array', function(t) { - var perms = analyze.street(''); - t.deepEqual(perms, ['']); - t.end(); + analyze.street('', function(err, perms) { + t.deepEqual(perms, ['']); + t.end(); + }); }); }; diff --git a/test/lib/mock_libpostal.js b/test/lib/mock_libpostal.js index 9d064750..c7188e8c 100644 --- a/test/lib/mock_libpostal.js +++ b/test/lib/mock_libpostal.js @@ -13,25 +13,31 @@ const use_real_libpostal = process.env.SEED_MOCK_LIBPOSTAL !== undefined; let mock_responses = require('../../test/lib/mock_libpostal_responses'); module.exports.expand = { - expand_address: function(input_string) { + expand_address: function(input_string, callback) { const clean_string = input_string.trim().toLowerCase(); // return a mocked response if one is available if (_.has(mock_responses, clean_string)) { - return mock_responses[clean_string]; + return setImmediate(() => { + callback(null, mock_responses[clean_string]); + }); // if no mock response is available but falling back to real libpostal // is enabled, lazy load real libpostal, and return the real response } else if (use_real_libpostal) { // lazy load libpostal only when needed - if (!real_libpostal) { real_libpostal = require('node-postal'); } + if (!real_libpostal) { real_libpostal = require('../../libpostal/service'); } - const real_response = real_libpostal.expand.expand_address(clean_string); - mock_responses[clean_string] = real_response; + real_libpostal.expand.expand_address(clean_string, function(err, real_response, metadata) { + if (err) { + throw err; + } + mock_responses[clean_string] = real_response; - // write the stored list of responses after _every_ new one is added. this is inefficient - // but it does not appear using `process.on('exit')` is reliable - fs.writeFileSync(__dirname +'/../../test/lib/mock_libpostal_responses.json', JSON.stringify(mock_responses, null, 2)); + // write the stored list of responses after _every_ new one is added. this is inefficient + // but it does not appear using `process.on('exit')` is reliable + fs.writeFileSync(__dirname +'/../../test/lib/mock_libpostal_responses.json', JSON.stringify(mock_responses, null, 2)); - return real_response; + return callback(null, real_response, metadata); + }); // if there is no mock response and falling back to real libpostal is disabled, // throw an error because a human has to run libpostal and find the correct response } else {