Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: non-english char encoding #496

Merged
merged 16 commits into from
Jun 23, 2022
11 changes: 8 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -326,8 +326,12 @@ Parameters - object which includes:

Should return resolved `Promise` if resource should be saved or rejected with Error `Promise` if it should be skipped.
Promise should be resolved with:
* `string` which contains response body
* or object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result.
* the `response` object with the `body` modified in place as necessary.
* or object with properties
* `body` (response body, string)
* `encoding` (`binary` or `utf8`) used to save the file, binary used by default.
* `metadata` (object) - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result.
* a binary `string`. This is advised against because of the binary assumption being made can foul up saving of `utf8` responses to the filesystem.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we bump major version for this change then we can get rid of string case here. That will simplify the code a lot


If multiple actions `afterResponse` added - scraper will use result from last one.
```javascript
Expand All @@ -342,7 +346,8 @@ registerAction('afterResponse', ({response}) => {
metadata: {
headers: response.headers,
someOtherData: [ 1, 2, 3 ]
}
},
encoding: 'utf8'
}
}
});
Expand Down
2 changes: 1 addition & 1 deletion lib/config/defaults.js
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ const config = {
],
request: {
throwHttpErrors: false,
encoding: 'binary',
responseType: 'buffer',
//cookieJar: true,
decompress: true,
headers: {
Expand Down
2 changes: 1 addition & 1 deletion lib/plugins/save-resource-to-fs-plugin.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class SaveResourceToFileSystemPlugin {
registerAction('saveResource', async ({resource}) => {
const filename = path.join(absoluteDirectoryPath, resource.getFilename());
const text = resource.getText();
await fs.outputFile(filename, text, { encoding: 'binary' });
await fs.outputFile(filename, text, { encoding: resource.getEncoding() });
loadedResources.push(resource);
});

Expand Down
87 changes: 68 additions & 19 deletions lib/request.js
Original file line number Diff line number Diff line change
@@ -1,32 +1,78 @@
import got from 'got';
import logger from './logger.js';
import { extend, isPlainObject } from './utils/index.js';
import { extend } from './utils/index.js';

function getMimeType (contentType) {
return contentType ? contentType.split(';')[0] : null;
}

function defaultResponseHandler ({response}) {
return Promise.resolve(response.body);
return Promise.resolve(response);
}

function extractEncodingFromHeader (headers) {
const contentTypeHeader = headers['content-type'];

return contentTypeHeader && contentTypeHeader.includes('utf-8') ? 'utf8' : 'binary';
}

function getEncoding (response) {
if (response && typeof response === 'object') {
if (response.headers && typeof response.headers === 'object') {
return extractEncodingFromHeader(response.headers);
} else if (response.encoding) {
return response.encoding;
}
}

return 'binary';
}

function throwTypeError (result) {
let type = typeof result;

if (result instanceof Error) {
throw result;
} else if (type === 'object' && Array.isArray(result)) {
type = 'array';
}

throw new Error(`Wrong response handler result. Expected string or object, but received ${type}`);
}

function getData (result) {
let data = result;
if (result && typeof result === 'object' && 'body' in result) {
data = result.body;
}

return data;
}

function transformResult (result) {
switch (true) {
case typeof result === 'string':
return {
body: result,
metadata: null
};
case isPlainObject(result):
return {
body: result.body,
metadata: result.metadata || null
};
case result === null:
return null;
default:
throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result);
const encoding = getEncoding(result);
const data = getData(result);

// Check for no data
if (data === null || data === undefined) {
return null;
}

// Then stringify it.
let body = null;
if (data instanceof Buffer) {
body = data.toString(encoding);
} else if (typeof data === 'string') {
body = data;
} else {
throwTypeError(result);
}

return {
body,
encoding,
metadata: result.metadata || data.metadata || null
};
}

async function getRequest ({url, referer, options = {}, afterResponse = defaultResponseHandler}) {
Expand All @@ -50,10 +96,13 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR
url: response.url,
mimeType: getMimeType(response.headers['content-type']),
body: responseHandlerResult.body,
metadata: responseHandlerResult.metadata
metadata: responseHandlerResult.metadata,
encoding: responseHandlerResult.encoding
};
}

export default {
get: getRequest
get: getRequest,
getEncoding,
transformResult
};
9 changes: 9 additions & 0 deletions lib/resource.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class Resource {
this.children = [];

this.saved = false;
this.encoding = 'binary';
}

createChild (url, filename) {
Expand Down Expand Up @@ -69,6 +70,14 @@ class Resource {
return this.type;
}

setEncoding (encoding) {
this.encoding = encoding;
}

getEncoding () {
return this.encoding;
}

isHtml () {
return this.getType() === types.html;
}
Expand Down
1 change: 1 addition & 0 deletions lib/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ class Scraper {
self.requestedResourcePromises.set(responseData.url, requestPromise);
}

resource.setEncoding(responseData.encoding);
resource.setType(getTypeByMime(responseData.mimeType));

const { filename } = await self.runActions('generateFilename', { resource, responseData });
Expand Down
28 changes: 14 additions & 14 deletions test/functional/encoding/hieroglyphs.test.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import '../../utils/assertions.js';
import nock from 'nock';
import fs from 'fs-extra';
import fs from 'fs/promises';
import scrape from 'website-scraper';

const testDirname = './test/functional/encoding/.tmp';
const mockDirname = './test/functional/encoding/mocks';

// TODO: enable test when encoding issue is fixed
xdescribe('Functional: Korean characters are properly encoded/decoded', function() {
describe('Functional: UTF8 characters are properly encoded/decoded', () => {
const options = {
urls: [
'http://example.com/',
Expand All @@ -16,27 +15,28 @@ xdescribe('Functional: Korean characters are properly encoded/decoded', function
ignoreErrors: false
};

beforeEach(function() {
beforeEach(() => {
nock.cleanAll();
nock.disableNetConnect();
});

afterEach(function() {
afterEach(async () => {
nock.cleanAll();
nock.enableNetConnect();
fs.removeSync(testDirname);
await fs.rm(testDirname, { recursive: true, force: true });
});

beforeEach(() => {
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html; charset=utf-8'});
});

it('should save the page in the same data as it was originally', () => {
return scrape(options).then(function(result) {
const scrapedIndex = fs.readFileSync(testDirname + '/index.html').toString();
scrapedIndex.should.be.containEql('<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Слава Україні!</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-chinese">加入网站</div>');
});
it('should save the page in the same data as it was originally', async () => {
await scrape(options);

const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' });
scrapedIndex.should.be.containEql('<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Слава Україні!</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-chinese">加入网站</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Обладнання та ПЗ</div>');
});
});
1 change: 1 addition & 0 deletions test/functional/encoding/mocks/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@
<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>
<div id="special-characters-ukrainian">Слава Україні!</div>
<div id="special-characters-chinese">加入网站</div>
<div id="special-characters-ukrainian">Обладнання та ПЗ</div>
</body>
</html>
Loading