From 5dfc172ad7d44622caa4d7d0fae4d349d363ec64 Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Sat, 18 Jun 2022 17:58:16 +0100 Subject: [PATCH 01/16] fix: non-english char encoding --- lib/config/defaults.js | 2 +- lib/plugins/save-resource-to-fs-plugin.js | 2 +- lib/request.js | 16 +++++++++++----- lib/resource.js | 9 +++++++++ lib/scraper.js | 1 + 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/lib/config/defaults.js b/lib/config/defaults.js index 1e79fe63..14e00077 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -48,7 +48,7 @@ const config = { ], request: { throwHttpErrors: false, - encoding: 'binary', + responseType: 'buffer', //cookieJar: true, decompress: true, headers: { diff --git a/lib/plugins/save-resource-to-fs-plugin.js b/lib/plugins/save-resource-to-fs-plugin.js index 0e7be17c..b5cfab02 100644 --- a/lib/plugins/save-resource-to-fs-plugin.js +++ b/lib/plugins/save-resource-to-fs-plugin.js @@ -20,7 +20,7 @@ class SaveResourceToFileSystemPlugin { registerAction('saveResource', async ({resource}) => { const filename = path.join(absoluteDirectoryPath, resource.getFilename()); const text = resource.getText(); - await fs.outputFile(filename, text, { encoding: 'binary' }); + await fs.outputFile(filename, text, { encoding: resource.getEncoding() }); loadedResources.push(resource); }); diff --git a/lib/request.js b/lib/request.js index 4ea4e76b..65f6a5f8 100644 --- a/lib/request.js +++ b/lib/request.js @@ -7,20 +7,25 @@ function getMimeType (contentType) { } function defaultResponseHandler ({response}) { - return Promise.resolve(response.body); + return Promise.resolve(response); } -function transformResult (result) { +function transformResult (response) { + const encoding = response.headers?.['content-type']?.includes('utf-8') ? 'utf8' : 'binary'; + const result = response.body.toString(encoding); + switch (true) { case typeof result === 'string': return { body: result, - metadata: null + metadata: null, + encoding }; case isPlainObject(result): return { body: result.body, - metadata: result.metadata || null + metadata: result.metadata || null, + encoding }; case result === null: return null; @@ -50,7 +55,8 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR url: response.url, mimeType: getMimeType(response.headers['content-type']), body: responseHandlerResult.body, - metadata: responseHandlerResult.metadata + metadata: responseHandlerResult.metadata, + encoding: responseHandlerResult.encoding }; } diff --git a/lib/resource.js b/lib/resource.js index f6acd65d..ae78886c 100644 --- a/lib/resource.js +++ b/lib/resource.js @@ -12,6 +12,7 @@ class Resource { this.children = []; this.saved = false; + this.encoding = 'binary'; } createChild (url, filename) { @@ -69,6 +70,14 @@ class Resource { return this.type; } + setEncoding (encoding) { + this.encoding = encoding; + } + + getEncoding () { + return this.encoding; + } + isHtml () { return this.getType() === types.html; } diff --git a/lib/scraper.js b/lib/scraper.js index 040a9cd9..e14a3d4d 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -170,6 +170,7 @@ class Scraper { self.requestedResourcePromises.set(responseData.url, requestPromise); } + resource.setEncoding(responseData.encoding); resource.setType(getTypeByMime(responseData.mimeType)); const { filename } = await self.runActions('generateFilename', { resource, responseData }); From 4826c4dd39a627bb7c573e073f42559844ce7cc4 Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Sat, 18 Jun 2022 18:08:15 +0100 Subject: [PATCH 02/16] fix: optional chaining not supported --- lib/request.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/request.js b/lib/request.js index 65f6a5f8..8de213d3 100644 --- a/lib/request.js +++ b/lib/request.js @@ -11,7 +11,7 @@ function defaultResponseHandler ({response}) { } function transformResult (response) { - const encoding = response.headers?.['content-type']?.includes('utf-8') ? 'utf8' : 'binary'; + const encoding = response.headers['content-type'] && response.headers['content-type'].includes('utf-8') ? 'utf8' : 'binary'; const result = response.body.toString(encoding); switch (true) { From 55160b84c2e523bdc957e0de4413c965281284be Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Sat, 18 Jun 2022 18:31:51 +0100 Subject: [PATCH 03/16] fix: various tests not finished yet --- lib/request.js | 18 +++++++++++++----- test/unit/scraper-test.js | 3 ++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/lib/request.js b/lib/request.js index 8de213d3..781b7479 100644 --- a/lib/request.js +++ b/lib/request.js @@ -11,10 +11,20 @@ function defaultResponseHandler ({response}) { } function transformResult (response) { - const encoding = response.headers['content-type'] && response.headers['content-type'].includes('utf-8') ? 'utf8' : 'binary'; - const result = response.body.toString(encoding); + // Response could be a response object or it could be a raw string/binary/whatever so we need to be safe and try + // to handle when it's not a response object. + + let result = response; + let encoding = 'binary'; + + if (response && typeof response.headers === 'object' && typeof response.body !== 'undefined') { + encoding = response.headers['content-type'] && response.headers['content-type'].includes('utf-8') ? 'utf8' : 'binary'; + result = response.body.toString(encoding); + } switch (true) { + case result === null: + return null; case typeof result === 'string': return { body: result, @@ -25,10 +35,8 @@ function transformResult (response) { return { body: result.body, metadata: result.metadata || null, - encoding + encoding: result.encoding || encoding || 'binary' }; - case result === null: - return null; default: throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result); } diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js index 87ee177c..0cfa780c 100644 --- a/test/unit/scraper-test.js +++ b/test/unit/scraper-test.js @@ -251,7 +251,7 @@ describe('Scraper', () => { class AddMetadataPlugin { apply (registerAction) { - registerAction('afterResponse', sinon.stub().returns({body: 'test body', metadata})); + registerAction('afterResponse', sinon.stub().returns({body: 'test body', metadata, encoding: 'utf8'})); } } @@ -272,6 +272,7 @@ describe('Scraper', () => { should(r.getUrl()).be.eql('http://example.com'); should(r.getType()).be.eql('html'); should(r.getFilename()).be.eql('generated-filename'); + should(r.getEncoding()).be.eql('utf8'); should(r.metadata).be.eql(metadata); }); }); From cfd5ac6751cb83459089d906986f70d8bbb6990e Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Sat, 18 Jun 2022 18:59:24 +0100 Subject: [PATCH 04/16] fix: tests now pass --- lib/request.js | 6 ++++-- test/unit/scraper-init-test.js | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/request.js b/lib/request.js index 781b7479..4034c623 100644 --- a/lib/request.js +++ b/lib/request.js @@ -20,11 +20,11 @@ function transformResult (response) { if (response && typeof response.headers === 'object' && typeof response.body !== 'undefined') { encoding = response.headers['content-type'] && response.headers['content-type'].includes('utf-8') ? 'utf8' : 'binary'; result = response.body.toString(encoding); + } else if (response instanceof Buffer) { + result = response.toString('binary'); } switch (true) { - case result === null: - return null; case typeof result === 'string': return { body: result, @@ -37,6 +37,8 @@ function transformResult (response) { metadata: result.metadata || null, encoding: result.encoding || encoding || 'binary' }; + case result === null: + return null; default: throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result); } diff --git a/test/unit/scraper-init-test.js b/test/unit/scraper-init-test.js index 9612e180..080c5976 100644 --- a/test/unit/scraper-init-test.js +++ b/test/unit/scraper-init-test.js @@ -121,7 +121,7 @@ describe('Scraper initialization', function () { s.options.request.should.containEql({ throwHttpErrors: false, - encoding: 'binary', + responseType: 'buffer', decompress: true, https: { rejectUnauthorized: false @@ -143,7 +143,7 @@ describe('Scraper initialization', function () { s.options.request.should.eql({ throwHttpErrors: true, - encoding: 'binary', + responseType: 'buffer', decompress: true, https: { rejectUnauthorized: false From 05b7dc46cabdee68cc272461d0eadf9de9500071 Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Sat, 18 Jun 2022 19:28:02 +0100 Subject: [PATCH 05/16] fix: testing and code complexity --- lib/request.js | 11 ++++++++++- test/unit/request-test.js | 7 ++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/lib/request.js b/lib/request.js index 4034c623..76281f3a 100644 --- a/lib/request.js +++ b/lib/request.js @@ -10,7 +10,7 @@ function defaultResponseHandler ({response}) { return Promise.resolve(response); } -function transformResult (response) { +function normalizeResponse(response) { // Response could be a response object or it could be a raw string/binary/whatever so we need to be safe and try // to handle when it's not a response object. @@ -24,6 +24,15 @@ function transformResult (response) { result = response.toString('binary'); } + return { + result, + encoding + }; +} + +function transformResult (response) { + const {result, encoding} = normalizeResponse(response); + switch (true) { case typeof result === 'string': return { diff --git a/test/unit/request-test.js b/test/unit/request-test.js index 2ac0f229..9026a642 100644 --- a/test/unit/request-test.js +++ b/test/unit/request-test.js @@ -66,12 +66,14 @@ describe('request', () => { nock(url).get('/').reply(200, 'TEST BODY'); const handlerStub = sinon.stub().resolves({ body: 'a', - metadata: 'b' + metadata: 'b', + encoding: 'utf8' }); return request.get({url, afterResponse: handlerStub}).then((data) => { should(data.body).be.eql('a'); should(data.metadata).be.eql('b'); + should(data.encoding).be.eql('utf8'); }); }); @@ -85,6 +87,7 @@ describe('request', () => { return request.get({url, afterResponse: handlerStub}).then((data) => { should(data.body).be.eql('a'); should(data.metadata).be.eql(null); + should(data.encoding).be.eql('binary'); }); }); @@ -124,6 +127,7 @@ describe('request', () => { data.url.should.be.eql('http://www.google.com/'); data.body.should.be.eql('Hello from Google!'); data.mimeType.should.be.eql('text/html'); + data.encoding.should.be.eql('utf8'); }); }); @@ -135,6 +139,7 @@ describe('request', () => { data.should.have.properties(['url', 'body', 'mimeType']); data.url.should.be.eql('http://www.google.com/'); data.body.should.be.eql('Hello from Google!'); + data.encoding.should.be.eql('binary'); should(data.mimeType).be.eql(null); }); }); From c8c37986d677dff27a5eeccf043e389a09b9afd8 Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Sat, 18 Jun 2022 19:32:32 +0100 Subject: [PATCH 06/16] fix: code climate --- lib/request.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/request.js b/lib/request.js index 76281f3a..e33f0294 100644 --- a/lib/request.js +++ b/lib/request.js @@ -18,7 +18,9 @@ function normalizeResponse(response) { let encoding = 'binary'; if (response && typeof response.headers === 'object' && typeof response.body !== 'undefined') { - encoding = response.headers['content-type'] && response.headers['content-type'].includes('utf-8') ? 'utf8' : 'binary'; + const contentTypeHeader = response.headers['content-type']; + + encoding = contentTypeHeader && contentTypeHeader.includes('utf-8') ? 'utf8' : 'binary'; result = response.body.toString(encoding); } else if (response instanceof Buffer) { result = response.toString('binary'); From 1e1cacff67cb75ece8f292d877063efd50e0253c Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Sat, 18 Jun 2022 19:35:19 +0100 Subject: [PATCH 07/16] fix: change comments --- lib/request.js | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/request.js b/lib/request.js index e33f0294..2229e0a8 100644 --- a/lib/request.js +++ b/lib/request.js @@ -10,10 +10,14 @@ function defaultResponseHandler ({response}) { return Promise.resolve(response); } -function normalizeResponse(response) { - // Response could be a response object or it could be a raw string/binary/whatever so we need to be safe and try - // to handle when it's not a response object. - +/** + * Normalizes the request response so to maintain compatibility with the old API + * while adding the ability to extract the encoding information from the response. + * + * @param response - Node Response, Buffer, string, or plain object. + * @returns result and encoding. + */ +function normalizeResponse (response) { let result = response; let encoding = 'binary'; From 0ff0dac02246348ed2149d1f6e0e11d1b382699d Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Sat, 18 Jun 2022 19:43:31 +0100 Subject: [PATCH 08/16] fix: simplify code complexity --- lib/request.js | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/lib/request.js b/lib/request.js index 2229e0a8..7cd36be4 100644 --- a/lib/request.js +++ b/lib/request.js @@ -10,6 +10,16 @@ function defaultResponseHandler ({response}) { return Promise.resolve(response); } +function getEncoding (response) { + if (response && typeof response.headers === 'object') { + const contentTypeHeader = response.headers['content-type']; + + return contentTypeHeader && contentTypeHeader.includes('utf-8') ? 'utf8' : 'binary'; + } + + return 'binary'; +} + /** * Normalizes the request response so to maintain compatibility with the old API * while adding the ability to extract the encoding information from the response. @@ -19,15 +29,16 @@ function defaultResponseHandler ({response}) { */ function normalizeResponse (response) { let result = response; - let encoding = 'binary'; - - if (response && typeof response.headers === 'object' && typeof response.body !== 'undefined') { - const contentTypeHeader = response.headers['content-type']; + let encoding = getEncoding(response); - encoding = contentTypeHeader && contentTypeHeader.includes('utf-8') ? 'utf8' : 'binary'; - result = response.body.toString(encoding); + if (response) { + if (response.body instanceof Buffer) { + result = response.body.toString(encoding); + } else { + result = response.body.toString(); + } } else if (response instanceof Buffer) { - result = response.toString('binary'); + result = response.toString(encoding); } return { From 8bfe5e956879dc4617df438b51ee08b881e142dc Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Mon, 20 Jun 2022 13:26:14 +0100 Subject: [PATCH 09/16] refactor: simplify logic and add more testing --- lib/request.js | 84 +++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 45 deletions(-) diff --git a/lib/request.js b/lib/request.js index 7cd36be4..ed5ce2d8 100644 --- a/lib/request.js +++ b/lib/request.js @@ -1,6 +1,6 @@ import got from 'got'; import logger from './logger.js'; -import { extend, isPlainObject } from './utils/index.js'; +import { extend } from './utils/index.js'; function getMimeType (contentType) { return contentType ? contentType.split(';')[0] : null; @@ -15,59 +15,51 @@ function getEncoding (response) { const contentTypeHeader = response.headers['content-type']; return contentTypeHeader && contentTypeHeader.includes('utf-8') ? 'utf8' : 'binary'; + } else if (response && response.encoding) { + return response.encoding; } return 'binary'; } -/** - * Normalizes the request response so to maintain compatibility with the old API - * while adding the ability to extract the encoding information from the response. - * - * @param response - Node Response, Buffer, string, or plain object. - * @returns result and encoding. - */ -function normalizeResponse (response) { - let result = response; - let encoding = getEncoding(response); - - if (response) { - if (response.body instanceof Buffer) { - result = response.body.toString(encoding); - } else { - result = response.body.toString(); - } - } else if (response instanceof Buffer) { - result = response.toString(encoding); +function throwTypeError(result) { + let type = typeof result; + + if (result instanceof Error) { + throw result; + } else if (type === 'object' && Array.isArray(result)) { + type = 'array'; } - return { - result, - encoding - }; + throw new Error(`Wrong response handler result. Expected string or object, but received ${type}`); } -function transformResult (response) { - const {result, encoding} = normalizeResponse(response); - - switch (true) { - case typeof result === 'string': - return { - body: result, - metadata: null, - encoding - }; - case isPlainObject(result): - return { - body: result.body, - metadata: result.metadata || null, - encoding: result.encoding || encoding || 'binary' - }; - case result === null: - return null; - default: - throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result); +function transformResult (result) { + let encoding = getEncoding(result); + + // First normalize down to find where the data should be. + let data = result; + if (result && typeof result === 'object' && 'body' in result) { + data = result.body; } + + // Then stringify it. + let body = null; + if (data instanceof Buffer) { + body = data.toString(encoding); + } else if (typeof data === 'string') { + body = data; + } else if (data === null || data === undefined) { + return null; + } else { + throwTypeError(result); + } + + return { + body, + encoding, + metadata: result.metadata || data.metadata || null + }; } async function getRequest ({url, referer, options = {}, afterResponse = defaultResponseHandler}) { @@ -97,5 +89,7 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR } export default { - get: getRequest + get: getRequest, + getEncoding, + transformResult }; From b6c202a8d4208139af23dcb9b3d8f3368c6df023 Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Mon, 20 Jun 2022 13:34:05 +0100 Subject: [PATCH 10/16] fix: reduce cognitive complexity --- lib/request.js | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/lib/request.js b/lib/request.js index ed5ce2d8..d135ddda 100644 --- a/lib/request.js +++ b/lib/request.js @@ -10,21 +10,27 @@ function defaultResponseHandler ({response}) { return Promise.resolve(response); } -function getEncoding (response) { - if (response && typeof response.headers === 'object') { - const contentTypeHeader = response.headers['content-type']; +function extractEncodingFromHeader (headers) { + const contentTypeHeader = headers['content-type']; + + return contentTypeHeader && contentTypeHeader.includes('utf-8') ? 'utf8' : 'binary'; +} - return contentTypeHeader && contentTypeHeader.includes('utf-8') ? 'utf8' : 'binary'; - } else if (response && response.encoding) { - return response.encoding; +function getEncoding (response) { + if (typeof response === 'object') { + if (typeof response.headers === 'object') { + return extractEncodingFromHeader(response.headers); + } else if (response.encoding) { + return response.encoding; + } } return 'binary'; } -function throwTypeError(result) { +function throwTypeError (result) { let type = typeof result; - + if (result instanceof Error) { throw result; } else if (type === 'object' && Array.isArray(result)) { @@ -43,14 +49,17 @@ function transformResult (result) { data = result.body; } + // Check for no data + if (data === null || data === undefined) { + return null; + } + // Then stringify it. let body = null; if (data instanceof Buffer) { body = data.toString(encoding); } else if (typeof data === 'string') { body = data; - } else if (data === null || data === undefined) { - return null; } else { throwTypeError(result); } From a971fbe8e06dfc89e260b94f44707d577f10b994 Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Mon, 20 Jun 2022 13:36:37 +0100 Subject: [PATCH 11/16] fix: more cognitive complexity --- lib/request.js | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/request.js b/lib/request.js index d135ddda..6e6ffc76 100644 --- a/lib/request.js +++ b/lib/request.js @@ -40,15 +40,19 @@ function throwTypeError (result) { throw new Error(`Wrong response handler result. Expected string or object, but received ${type}`); } -function transformResult (result) { - let encoding = getEncoding(result); - - // First normalize down to find where the data should be. +function getData(result) { let data = result; if (result && typeof result === 'object' && 'body' in result) { data = result.body; } + return data; +} + +function transformResult (result) { + const encoding = getEncoding(result); + const data = getData(result); + // Check for no data if (data === null || data === undefined) { return null; From 26134d0c98dcaca547931b6ce6de1795f810781f Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Mon, 20 Jun 2022 13:38:50 +0100 Subject: [PATCH 12/16] chore: damn space --- lib/request.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/request.js b/lib/request.js index 6e6ffc76..54da2a11 100644 --- a/lib/request.js +++ b/lib/request.js @@ -40,7 +40,7 @@ function throwTypeError (result) { throw new Error(`Wrong response handler result. Expected string or object, but received ${type}`); } -function getData(result) { +function getData (result) { let data = result; if (result && typeof result === 'object' && 'body' in result) { data = result.body; From 2a7072c244dc4fba5611a714a2962908f3ec5d0b Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Mon, 20 Jun 2022 13:41:14 +0100 Subject: [PATCH 13/16] fix: broke a test --- lib/request.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/request.js b/lib/request.js index 54da2a11..f66ce29c 100644 --- a/lib/request.js +++ b/lib/request.js @@ -17,8 +17,8 @@ function extractEncodingFromHeader (headers) { } function getEncoding (response) { - if (typeof response === 'object') { - if (typeof response.headers === 'object') { + if (response && typeof response === 'object') { + if (response.headers && typeof response.headers === 'object') { return extractEncodingFromHeader(response.headers); } else if (response.encoding) { return response.encoding; From ea6d3b2df0cff4112de64d5394d860cdbae8741a Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Mon, 20 Jun 2022 14:02:17 +0100 Subject: [PATCH 14/16] chore: update docs --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2fb770fa..8da35183 100644 --- a/README.md +++ b/README.md @@ -326,8 +326,12 @@ Parameters - object which includes: Should return resolved `Promise` if resource should be saved or rejected with Error `Promise` if it should be skipped. Promise should be resolved with: -* `string` which contains response body -* or object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result. +* the `response` object with the `body` modified in place as necessary. +* or object with properties + * `body` (response body, string) + * `encoding` (`binary` or `utf8`) used to save the file, binary used by default. + * `metadata` (object) - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result. +* a binary `string`. This is advised against because of the binary assumption being made can foul up saving of `utf8` responses to the filesystem. If multiple actions `afterResponse` added - scraper will use result from last one. ```javascript @@ -342,7 +346,8 @@ registerAction('afterResponse', ({response}) => { metadata: { headers: response.headers, someOtherData: [ 1, 2, 3 ] - } + }, + encoding: 'utf8' } } }); From 6aca94c9e1cf6b0734440df0fef9645bacefdf13 Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Wed, 22 Jun 2022 15:11:03 +0100 Subject: [PATCH 15/16] test: re-enable utf8 test --- test/functional/encoding/hieroglyphs.test.js | 28 ++++++++++---------- test/functional/encoding/mocks/index.html | 1 + 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/test/functional/encoding/hieroglyphs.test.js b/test/functional/encoding/hieroglyphs.test.js index 1ee31973..57f5004d 100644 --- a/test/functional/encoding/hieroglyphs.test.js +++ b/test/functional/encoding/hieroglyphs.test.js @@ -1,13 +1,12 @@ import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/encoding/.tmp'; const mockDirname = './test/functional/encoding/mocks'; -// TODO: enable test when encoding issue is fixed -xdescribe('Functional: Korean characters are properly encoded/decoded', function() { +describe('Functional: UTF8 characters are properly encoded/decoded', () => { const options = { urls: [ 'http://example.com/', @@ -16,27 +15,28 @@ xdescribe('Functional: Korean characters are properly encoded/decoded', function ignoreErrors: false }; - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); beforeEach(() => { - nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'}); + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html; charset=utf-8'}); }); - it('should save the page in the same data as it was originally', () => { - return scrape(options).then(function(result) { - const scrapedIndex = fs.readFileSync(testDirname + '/index.html').toString(); - scrapedIndex.should.be.containEql('
저는 7년 동안 한국에서 살았어요.
'); - scrapedIndex.should.be.containEql('
Слава Україні!
'); - scrapedIndex.should.be.containEql('
加入网站
'); - }); + it('should save the page in the same data as it was originally', async () => { + await scrape(options); + + const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' }); + scrapedIndex.should.be.containEql('
저는 7년 동안 한국에서 살았어요.
'); + scrapedIndex.should.be.containEql('
Слава Україні!
'); + scrapedIndex.should.be.containEql('
加入网站
'); + scrapedIndex.should.be.containEql('
Обладнання та ПЗ
'); }); }); diff --git a/test/functional/encoding/mocks/index.html b/test/functional/encoding/mocks/index.html index 8d724cc2..8874cc53 100644 --- a/test/functional/encoding/mocks/index.html +++ b/test/functional/encoding/mocks/index.html @@ -8,5 +8,6 @@
저는 7년 동안 한국에서 살았어요.
Слава Україні!
加入网站
+
Обладнання та ПЗ
From c6711cb353750ca3405014b70613464e2f010e30 Mon Sep 17 00:00:00 2001 From: Paul Hawxby Date: Tue, 21 Jun 2022 15:54:07 +0100 Subject: [PATCH 16/16] chore: update request test --- test/unit/request-test.js | 143 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) diff --git a/test/unit/request-test.js b/test/unit/request-test.js index 9026a642..80db6cc0 100644 --- a/test/unit/request-test.js +++ b/test/unit/request-test.js @@ -144,3 +144,146 @@ describe('request', () => { }); }); }); + +describe('get encoding', () => { + it('should return binary by default', () => { + const result = request.getEncoding(null); + + should(result).be.eql('binary'); + }); + + it('should return binary when no content-type header supplies', () => { + const result = request.getEncoding({ + headers: {} + }); + + should(result).be.eql('binary'); + }); + + it('should return binary when content type header doesn\'t include utf-8', () => { + const result = request.getEncoding({ + headers: {} + }); + + should(result).be.eql('binary'); + }); + + it('should return binary when content type header doesn\'t include utf-8', () => { + const result = request.getEncoding({ + headers: { + 'content-type': 'text/html' + } + }); + + should(result).be.eql('binary'); + }); + + it('should return utf8 when content type includes utf-8', () => { + const result = request.getEncoding({ + headers: { + 'content-type': 'text/html; charset=utf-8' + } + }); + + should(result).be.eql('utf8'); + }); + + it('should return utf8 response object includes it', () => { + const result = request.getEncoding({ + encoding: 'utf8' + }); + + should(result).be.eql('utf8'); + }); +}); + +describe('transformResult', () => { + it('should throw with weird shaped response', () => { + try { + request.transformResult([1,2,3]); + + // We shouldn't get here. + should(true).eql(false); + } catch (e) { + should(e).be.instanceOf(Error); + should(e.message).eql('Wrong response handler result. Expected string or object, but received array'); + } + }); + + it('should pass through error', () => { + try { + request.transformResult(new Error('Oh no')); + + // We shouldn't get here. + should(true).eql(false); + } catch (e) { + should(e).be.instanceOf(Error); + should(e.message).eql('Oh no'); + } + }); + + it('should throw with boolean input', () => { + try { + request.transformResult(true); + + // We shouldn't get here. + should(true).eql(false); + } catch (e) { + should(e).be.instanceOf(Error); + should(e.message).eql('Wrong response handler result. Expected string or object, but received boolean'); + } + }); + + it('should handle object', () => { + const result = request.transformResult({ + body: 'SOME BODY', + encoding: 'utf8', + metadata: { foo: 'bar' } + }); + + should(result).have.property('body', 'SOME BODY'); + should(result).have.property('encoding', 'utf8'); + should(result).have.property('metadata', { foo: 'bar' }); + }); + + it('should handle object with empty body string', () => { + const result = request.transformResult({ + body: '', + encoding: 'utf8', + }); + + should(result).have.property('body', ''); + should(result).have.property('encoding', 'utf8'); + should(result).have.property('metadata', null); + }); + + it('should handle object with defaults and buffer body', () => { + const result = request.transformResult({ + body: Buffer.from('SOME BODY'), + }); + + should(result).have.property('body', 'SOME BODY'); + should(result).have.property('encoding', 'binary'); + should(result).have.property('metadata', null); + }); + + it('should handle raw string input', () => { + const result = request.transformResult('SOME BODY'); + + should(result).have.property('body', 'SOME BODY'); + should(result).have.property('encoding', 'binary'); + should(result).have.property('metadata', null); + }); + + it('should handle null input', () => { + const result = request.transformResult(null); + + should(result).eqls(null); + }); + + it('should handle undefined input', () => { + const result = request.transformResult(undefined); + + should(result).eqls(null); + }); +});