diff --git a/README.md b/README.md index 2fb770fa..8da35183 100644 --- a/README.md +++ b/README.md @@ -326,8 +326,12 @@ Parameters - object which includes: Should return resolved `Promise` if resource should be saved or rejected with Error `Promise` if it should be skipped. Promise should be resolved with: -* `string` which contains response body -* or object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result. +* the `response` object with the `body` modified in place as necessary. +* or object with properties + * `body` (response body, string) + * `encoding` (`binary` or `utf8`) used to save the file, binary used by default. + * `metadata` (object) - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result. +* a binary `string`. This is advised against because of the binary assumption being made can foul up saving of `utf8` responses to the filesystem. If multiple actions `afterResponse` added - scraper will use result from last one. ```javascript @@ -342,7 +346,8 @@ registerAction('afterResponse', ({response}) => { metadata: { headers: response.headers, someOtherData: [ 1, 2, 3 ] - } + }, + encoding: 'utf8' } } }); diff --git a/lib/config/defaults.js b/lib/config/defaults.js index 1e79fe63..14e00077 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -48,7 +48,7 @@ const config = { ], request: { throwHttpErrors: false, - encoding: 'binary', + responseType: 'buffer', //cookieJar: true, decompress: true, headers: { diff --git a/lib/plugins/save-resource-to-fs-plugin.js b/lib/plugins/save-resource-to-fs-plugin.js index 0e7be17c..b5cfab02 100644 --- a/lib/plugins/save-resource-to-fs-plugin.js +++ b/lib/plugins/save-resource-to-fs-plugin.js @@ -20,7 +20,7 @@ class SaveResourceToFileSystemPlugin { registerAction('saveResource', async ({resource}) => { const filename = path.join(absoluteDirectoryPath, resource.getFilename()); const text = resource.getText(); - await fs.outputFile(filename, text, { encoding: 'binary' }); + await fs.outputFile(filename, text, { encoding: resource.getEncoding() }); loadedResources.push(resource); }); diff --git a/lib/request.js b/lib/request.js index 4ea4e76b..f66ce29c 100644 --- a/lib/request.js +++ b/lib/request.js @@ -1,32 +1,78 @@ import got from 'got'; import logger from './logger.js'; -import { extend, isPlainObject } from './utils/index.js'; +import { extend } from './utils/index.js'; function getMimeType (contentType) { return contentType ? contentType.split(';')[0] : null; } function defaultResponseHandler ({response}) { - return Promise.resolve(response.body); + return Promise.resolve(response); +} + +function extractEncodingFromHeader (headers) { + const contentTypeHeader = headers['content-type']; + + return contentTypeHeader && contentTypeHeader.includes('utf-8') ? 'utf8' : 'binary'; +} + +function getEncoding (response) { + if (response && typeof response === 'object') { + if (response.headers && typeof response.headers === 'object') { + return extractEncodingFromHeader(response.headers); + } else if (response.encoding) { + return response.encoding; + } + } + + return 'binary'; +} + +function throwTypeError (result) { + let type = typeof result; + + if (result instanceof Error) { + throw result; + } else if (type === 'object' && Array.isArray(result)) { + type = 'array'; + } + + throw new Error(`Wrong response handler result. Expected string or object, but received ${type}`); +} + +function getData (result) { + let data = result; + if (result && typeof result === 'object' && 'body' in result) { + data = result.body; + } + + return data; } function transformResult (result) { - switch (true) { - case typeof result === 'string': - return { - body: result, - metadata: null - }; - case isPlainObject(result): - return { - body: result.body, - metadata: result.metadata || null - }; - case result === null: - return null; - default: - throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result); + const encoding = getEncoding(result); + const data = getData(result); + + // Check for no data + if (data === null || data === undefined) { + return null; } + + // Then stringify it. + let body = null; + if (data instanceof Buffer) { + body = data.toString(encoding); + } else if (typeof data === 'string') { + body = data; + } else { + throwTypeError(result); + } + + return { + body, + encoding, + metadata: result.metadata || data.metadata || null + }; } async function getRequest ({url, referer, options = {}, afterResponse = defaultResponseHandler}) { @@ -50,10 +96,13 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR url: response.url, mimeType: getMimeType(response.headers['content-type']), body: responseHandlerResult.body, - metadata: responseHandlerResult.metadata + metadata: responseHandlerResult.metadata, + encoding: responseHandlerResult.encoding }; } export default { - get: getRequest + get: getRequest, + getEncoding, + transformResult }; diff --git a/lib/resource.js b/lib/resource.js index f6acd65d..ae78886c 100644 --- a/lib/resource.js +++ b/lib/resource.js @@ -12,6 +12,7 @@ class Resource { this.children = []; this.saved = false; + this.encoding = 'binary'; } createChild (url, filename) { @@ -69,6 +70,14 @@ class Resource { return this.type; } + setEncoding (encoding) { + this.encoding = encoding; + } + + getEncoding () { + return this.encoding; + } + isHtml () { return this.getType() === types.html; } diff --git a/lib/scraper.js b/lib/scraper.js index 040a9cd9..e14a3d4d 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -170,6 +170,7 @@ class Scraper { self.requestedResourcePromises.set(responseData.url, requestPromise); } + resource.setEncoding(responseData.encoding); resource.setType(getTypeByMime(responseData.mimeType)); const { filename } = await self.runActions('generateFilename', { resource, responseData }); diff --git a/test/functional/encoding/hieroglyphs.test.js b/test/functional/encoding/hieroglyphs.test.js index 1ee31973..57f5004d 100644 --- a/test/functional/encoding/hieroglyphs.test.js +++ b/test/functional/encoding/hieroglyphs.test.js @@ -1,13 +1,12 @@ import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/encoding/.tmp'; const mockDirname = './test/functional/encoding/mocks'; -// TODO: enable test when encoding issue is fixed -xdescribe('Functional: Korean characters are properly encoded/decoded', function() { +describe('Functional: UTF8 characters are properly encoded/decoded', () => { const options = { urls: [ 'http://example.com/', @@ -16,27 +15,28 @@ xdescribe('Functional: Korean characters are properly encoded/decoded', function ignoreErrors: false }; - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); beforeEach(() => { - nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'}); + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html; charset=utf-8'}); }); - it('should save the page in the same data as it was originally', () => { - return scrape(options).then(function(result) { - const scrapedIndex = fs.readFileSync(testDirname + '/index.html').toString(); - scrapedIndex.should.be.containEql('