diff --git a/README.md b/README.md index 2fb770fa..8da35183 100644 --- a/README.md +++ b/README.md @@ -326,8 +326,12 @@ Parameters - object which includes: Should return resolved `Promise` if resource should be saved or rejected with Error `Promise` if it should be skipped. Promise should be resolved with: -* `string` which contains response body -* or object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result. +* the `response` object with the `body` modified in place as necessary. +* or object with properties + * `body` (response body, string) + * `encoding` (`binary` or `utf8`) used to save the file, `binary` is used by default. + * `metadata` (object) - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result. +* a binary `string`. This is advised against because the binary assumption being made can foul up saving of `utf8` responses to the filesystem. If multiple actions `afterResponse` added - scraper will use result from last one. 
```javascript @@ -342,7 +346,8 @@ registerAction('afterResponse', ({response}) => { metadata: { headers: response.headers, someOtherData: [ 1, 2, 3 ] - } + }, + encoding: 'utf8' } } }); diff --git a/lib/config/defaults.js b/lib/config/defaults.js index 1e79fe63..14e00077 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -48,7 +48,7 @@ const config = { ], request: { throwHttpErrors: false, - encoding: 'binary', + responseType: 'buffer', //cookieJar: true, decompress: true, headers: { diff --git a/lib/plugins/save-resource-to-fs-plugin.js b/lib/plugins/save-resource-to-fs-plugin.js index 0e7be17c..b5cfab02 100644 --- a/lib/plugins/save-resource-to-fs-plugin.js +++ b/lib/plugins/save-resource-to-fs-plugin.js @@ -20,7 +20,7 @@ class SaveResourceToFileSystemPlugin { registerAction('saveResource', async ({resource}) => { const filename = path.join(absoluteDirectoryPath, resource.getFilename()); const text = resource.getText(); - await fs.outputFile(filename, text, { encoding: 'binary' }); + await fs.outputFile(filename, text, { encoding: resource.getEncoding() }); loadedResources.push(resource); }); diff --git a/lib/request.js b/lib/request.js index 4ea4e76b..f66ce29c 100644 --- a/lib/request.js +++ b/lib/request.js @@ -1,32 +1,78 @@ import got from 'got'; import logger from './logger.js'; -import { extend, isPlainObject } from './utils/index.js'; +import { extend } from './utils/index.js'; function getMimeType (contentType) { return contentType ? contentType.split(';')[0] : null; } function defaultResponseHandler ({response}) { - return Promise.resolve(response.body); + return Promise.resolve(response); +} + +function extractEncodingFromHeader (headers) { + const contentTypeHeader = headers['content-type']; + + return contentTypeHeader && contentTypeHeader.includes('utf-8') ? 
'utf8' : 'binary'; +} + +function getEncoding (response) { + if (response && typeof response === 'object') { + if (response.headers && typeof response.headers === 'object') { + return extractEncodingFromHeader(response.headers); + } else if (response.encoding) { + return response.encoding; + } + } + + return 'binary'; +} + +function throwTypeError (result) { + let type = typeof result; + + if (result instanceof Error) { + throw result; + } else if (type === 'object' && Array.isArray(result)) { + type = 'array'; + } + + throw new Error(`Wrong response handler result. Expected string or object, but received ${type}`); +} + +function getData (result) { + let data = result; + if (result && typeof result === 'object' && 'body' in result) { + data = result.body; + } + + return data; } function transformResult (result) { - switch (true) { - case typeof result === 'string': - return { - body: result, - metadata: null - }; - case isPlainObject(result): - return { - body: result.body, - metadata: result.metadata || null - }; - case result === null: - return null; - default: - throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result); + const encoding = getEncoding(result); + const data = getData(result); + + // Check for no data + if (data === null || data === undefined) { + return null; } + + // Then stringify it. 
+ let body = null; + if (data instanceof Buffer) { + body = data.toString(encoding); + } else if (typeof data === 'string') { + body = data; + } else { + throwTypeError(result); + } + + return { + body, + encoding, + metadata: result.metadata || data.metadata || null + }; } async function getRequest ({url, referer, options = {}, afterResponse = defaultResponseHandler}) { @@ -50,10 +96,13 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR url: response.url, mimeType: getMimeType(response.headers['content-type']), body: responseHandlerResult.body, - metadata: responseHandlerResult.metadata + metadata: responseHandlerResult.metadata, + encoding: responseHandlerResult.encoding }; } export default { - get: getRequest + get: getRequest, + getEncoding, + transformResult }; diff --git a/lib/resource.js b/lib/resource.js index f6acd65d..ae78886c 100644 --- a/lib/resource.js +++ b/lib/resource.js @@ -12,6 +12,7 @@ class Resource { this.children = []; this.saved = false; + this.encoding = 'binary'; } createChild (url, filename) { @@ -69,6 +70,14 @@ class Resource { return this.type; } + setEncoding (encoding) { + this.encoding = encoding; + } + + getEncoding () { + return this.encoding; + } + isHtml () { return this.getType() === types.html; } diff --git a/lib/scraper.js b/lib/scraper.js index 040a9cd9..e14a3d4d 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -170,6 +170,7 @@ class Scraper { self.requestedResourcePromises.set(responseData.url, requestPromise); } + resource.setEncoding(responseData.encoding); resource.setType(getTypeByMime(responseData.mimeType)); const { filename } = await self.runActions('generateFilename', { resource, responseData }); diff --git a/test/functional/encoding/hieroglyphs.test.js b/test/functional/encoding/hieroglyphs.test.js index 1ee31973..57f5004d 100644 --- a/test/functional/encoding/hieroglyphs.test.js +++ b/test/functional/encoding/hieroglyphs.test.js @@ -1,13 +1,12 @@ import 
'../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/encoding/.tmp'; const mockDirname = './test/functional/encoding/mocks'; -// TODO: enable test when encoding issue is fixed -xdescribe('Functional: Korean characters are properly encoded/decoded', function() { +describe('Functional: UTF8 characters are properly encoded/decoded', () => { const options = { urls: [ 'http://example.com/', @@ -16,27 +15,28 @@ xdescribe('Functional: Korean characters are properly encoded/decoded', function ignoreErrors: false }; - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); beforeEach(() => { - nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'}); + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html; charset=utf-8'}); }); - it('should save the page in the same data as it was originally', () => { - return scrape(options).then(function(result) { - const scrapedIndex = fs.readFileSync(testDirname + '/index.html').toString(); - scrapedIndex.should.be.containEql('
저는 7년 동안 한국에서 살았어요.
'); - scrapedIndex.should.be.containEql('
Слава Україні!
'); - scrapedIndex.should.be.containEql('
加入网站
'); - }); + it('should save the page in the same data as it was originally', async () => { + await scrape(options); + + const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' }); + scrapedIndex.should.be.containEql('
저는 7년 동안 한국에서 살았어요.
'); + scrapedIndex.should.be.containEql('
Слава Україні!
'); + scrapedIndex.should.be.containEql('
加入网站
'); + scrapedIndex.should.be.containEql('
Обладнання та ПЗ
'); }); }); diff --git a/test/functional/encoding/mocks/index.html b/test/functional/encoding/mocks/index.html index 8d724cc2..8874cc53 100644 --- a/test/functional/encoding/mocks/index.html +++ b/test/functional/encoding/mocks/index.html @@ -8,5 +8,6 @@
저는 7년 동안 한국에서 살았어요.
Слава Україні!
加入网站
+
Обладнання та ПЗ
diff --git a/test/unit/request-test.js b/test/unit/request-test.js index 2ac0f229..80db6cc0 100644 --- a/test/unit/request-test.js +++ b/test/unit/request-test.js @@ -66,12 +66,14 @@ describe('request', () => { nock(url).get('/').reply(200, 'TEST BODY'); const handlerStub = sinon.stub().resolves({ body: 'a', - metadata: 'b' + metadata: 'b', + encoding: 'utf8' }); return request.get({url, afterResponse: handlerStub}).then((data) => { should(data.body).be.eql('a'); should(data.metadata).be.eql('b'); + should(data.encoding).be.eql('utf8'); }); }); @@ -85,6 +87,7 @@ describe('request', () => { return request.get({url, afterResponse: handlerStub}).then((data) => { should(data.body).be.eql('a'); should(data.metadata).be.eql(null); + should(data.encoding).be.eql('binary'); }); }); @@ -124,6 +127,7 @@ describe('request', () => { data.url.should.be.eql('http://www.google.com/'); data.body.should.be.eql('Hello from Google!'); data.mimeType.should.be.eql('text/html'); + data.encoding.should.be.eql('utf8'); }); }); @@ -135,7 +139,151 @@ describe('request', () => { data.should.have.properties(['url', 'body', 'mimeType']); data.url.should.be.eql('http://www.google.com/'); data.body.should.be.eql('Hello from Google!'); + data.encoding.should.be.eql('binary'); should(data.mimeType).be.eql(null); }); }); }); + +describe('get encoding', () => { + it('should return binary by default', () => { + const result = request.getEncoding(null); + + should(result).be.eql('binary'); + }); + + it('should return binary when no content-type header supplies', () => { + const result = request.getEncoding({ + headers: {} + }); + + should(result).be.eql('binary'); + }); + + it('should return binary when content type header doesn\'t include utf-8', () => { + const result = request.getEncoding({ + headers: {} + }); + + should(result).be.eql('binary'); + }); + + it('should return binary when content type header doesn\'t include utf-8', () => { + const result = request.getEncoding({ + headers: { + 
'content-type': 'text/html' + } + }); + + should(result).be.eql('binary'); + }); + + it('should return utf8 when content type includes utf-8', () => { + const result = request.getEncoding({ + headers: { + 'content-type': 'text/html; charset=utf-8' + } + }); + + should(result).be.eql('utf8'); + }); + + it('should return utf8 response object includes it', () => { + const result = request.getEncoding({ + encoding: 'utf8' + }); + + should(result).be.eql('utf8'); + }); +}); + +describe('transformResult', () => { + it('should throw with weird shaped response', () => { + try { + request.transformResult([1,2,3]); + + // We shouldn't get here. + should(true).eql(false); + } catch (e) { + should(e).be.instanceOf(Error); + should(e.message).eql('Wrong response handler result. Expected string or object, but received array'); + } + }); + + it('should pass through error', () => { + try { + request.transformResult(new Error('Oh no')); + + // We shouldn't get here. + should(true).eql(false); + } catch (e) { + should(e).be.instanceOf(Error); + should(e.message).eql('Oh no'); + } + }); + + it('should throw with boolean input', () => { + try { + request.transformResult(true); + + // We shouldn't get here. + should(true).eql(false); + } catch (e) { + should(e).be.instanceOf(Error); + should(e.message).eql('Wrong response handler result. 
Expected string or object, but received boolean'); + } + }); + + it('should handle object', () => { + const result = request.transformResult({ + body: 'SOME BODY', + encoding: 'utf8', + metadata: { foo: 'bar' } + }); + + should(result).have.property('body', 'SOME BODY'); + should(result).have.property('encoding', 'utf8'); + should(result).have.property('metadata', { foo: 'bar' }); + }); + + it('should handle object with empty body string', () => { + const result = request.transformResult({ + body: '', + encoding: 'utf8', + }); + + should(result).have.property('body', ''); + should(result).have.property('encoding', 'utf8'); + should(result).have.property('metadata', null); + }); + + it('should handle object with defaults and buffer body', () => { + const result = request.transformResult({ + body: Buffer.from('SOME BODY'), + }); + + should(result).have.property('body', 'SOME BODY'); + should(result).have.property('encoding', 'binary'); + should(result).have.property('metadata', null); + }); + + it('should handle raw string input', () => { + const result = request.transformResult('SOME BODY'); + + should(result).have.property('body', 'SOME BODY'); + should(result).have.property('encoding', 'binary'); + should(result).have.property('metadata', null); + }); + + it('should handle null input', () => { + const result = request.transformResult(null); + + should(result).eqls(null); + }); + + it('should handle undefined input', () => { + const result = request.transformResult(undefined); + + should(result).eqls(null); + }); +}); diff --git a/test/unit/scraper-init-test.js b/test/unit/scraper-init-test.js index 9612e180..080c5976 100644 --- a/test/unit/scraper-init-test.js +++ b/test/unit/scraper-init-test.js @@ -121,7 +121,7 @@ describe('Scraper initialization', function () { s.options.request.should.containEql({ throwHttpErrors: false, - encoding: 'binary', + responseType: 'buffer', decompress: true, https: { rejectUnauthorized: false @@ -143,7 +143,7 @@ describe('Scraper 
initialization', function () { s.options.request.should.eql({ throwHttpErrors: true, - encoding: 'binary', + responseType: 'buffer', decompress: true, https: { rejectUnauthorized: false diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js index 87ee177c..0cfa780c 100644 --- a/test/unit/scraper-test.js +++ b/test/unit/scraper-test.js @@ -251,7 +251,7 @@ describe('Scraper', () => { class AddMetadataPlugin { apply (registerAction) { - registerAction('afterResponse', sinon.stub().returns({body: 'test body', metadata})); + registerAction('afterResponse', sinon.stub().returns({body: 'test body', metadata, encoding: 'utf8'})); } } @@ -272,6 +272,7 @@ describe('Scraper', () => { should(r.getUrl()).be.eql('http://example.com'); should(r.getType()).be.eql('html'); should(r.getFilename()).be.eql('generated-filename'); + should(r.getEncoding()).be.eql('utf8'); should(r.metadata).be.eql(metadata); }); });