Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use encoding from resource text #504

Merged
merged 6 commits into from
Aug 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .eslintrc.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
extends: "eslint:recommended"
parserOptions:
ecmaVersion: 8
ecmaVersion: 2020
sourceType: "module"
env:
node: true
Expand Down
20 changes: 15 additions & 5 deletions lib/resource-handler/css/index.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import CssText from './../path-containers/css-text.js';
import { getCharsetFromCss, updateResourceEncoding } from '../../utils/index.js';

class CssResourceHandler {
constructor (options, methods) {
Expand All @@ -7,12 +8,21 @@ class CssResourceHandler {
this.updateMissingSources = this.options.updateMissingSources === true || Array.isArray(this.options.updateMissingSources);
}

handle (resource) {
async handle (resource) {
prepareToLoad(resource);

const pathContainer = new CssText(resource.getText());
return this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources).then(function updateText (updatedText) {
resource.setText(updatedText);
return resource;
});

const updatedText = await this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources);
resource.setText(updatedText);
return resource;
}
}

/**
 * Inspects the css text of the resource before it is processed and, when the
 * stylesheet declares a utf-8 charset, switches the resource encoding to utf8.
 *
 * @param {Object} resource - resource whose text is read via getText().
 * @returns {void}
 */
function prepareToLoad (resource) {
	const declaredCharset = getCharsetFromCss(resource.getText());

	// do we need to support more charsets here?
	const isUtf8 = declaredCharset === 'utf-8';
	if (!isUtf8) {
		return;
	}

	updateResourceEncoding(resource, 'utf8');
}

Expand Down
22 changes: 17 additions & 5 deletions lib/resource-handler/html/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import cheerio from 'cheerio';
import { union, getUrl, series } from '../../utils/index.js';
import { union, getUrl, series, updateResourceEncoding } from '../../utils/index.js';
import logger from '../../logger.js';
import HtmlSourceElement from './html-source-element.js';

Expand All @@ -23,9 +23,8 @@ class HtmlResourceHandler {
}

async handle (resource) {
prepareToLoad(resource);
const $ = loadTextToCheerio(resource.getText());
prepareToLoad($, resource);

const sourceRulesLoadPromises = this.allSources.map(
rule => this.loadResourcesForRule.bind(this, $, resource, rule)
);
Expand Down Expand Up @@ -68,16 +67,29 @@ class HtmlResourceHandler {
}
}

function prepareToLoad ($, resource) {
$('base').each((i, element) => {
function prepareToLoad (resource) {
const $ = loadTextToCheerio(resource.getText());

$('base[href]').each((i, element) => {
const el = $(element);
const href = el.attr('href');
if (href) {
const newUrl = getUrl(resource.getUrl(), href);
logger.debug(`<base> tag found in resource ${resource}, changing url to ${newUrl}`);
resource.setUrl(newUrl);

el.remove();
resource.setText($.html());
}
});

$('meta[charset]').each((i, element) => {
const el = $(element);
const charset = el.attr('charset')?.toLowerCase();
if (charset && charset === 'utf-8') { // utf-8 is the only valid value for html5 documents
updateResourceEncoding(resource, 'utf8');
}
});
}

function loadTextToCheerio (text) {
Expand Down
2 changes: 1 addition & 1 deletion lib/resource.js
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class Resource {
}

toString () {
return '{ url: "' + this.getUrl() + '", filename: "' + this.getFilename() + '", depth: ' + this.getDepth() + ' }';
return `{ url: "${this.getUrl()}", filename: "${this.getFilename()}", depth: ${this.getDepth()}, type: "${this.getType()}" }`;
}

isSaved () {
Expand Down
3 changes: 2 additions & 1 deletion lib/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,6 @@ class Scraper {
self.requestedResourcePromises.set(responseData.url, requestPromise);
}

resource.setEncoding(responseData.encoding);
resource.setType(getTypeByMime(responseData.mimeType));

const { filename } = await self.runActions('generateFilename', { resource, responseData });
Expand All @@ -185,7 +184,9 @@ class Scraper {
resource.setMetadata(responseData.metadata);
}

resource.setEncoding(responseData.encoding);
resource.setText(responseData.body);

self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
return resource;
}).catch(function handleError (err) {
Expand Down
30 changes: 29 additions & 1 deletion lib/utils/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,32 @@ async function series (promises) {
return results;
}

/**
 * Extracts the charset declared by an `@charset` rule at the very beginning
 * of a stylesheet.
 *
 * The rule is accepted only in the form `@charset "name"` or `@charset 'name'`
 * (exactly one whitespace character after `@charset`), immediately followed by
 * whitespace or `;`. Anything else — leading whitespace, an unquoted name,
 * a rule that appears later in the text — yields null.
 *
 * @param {string} cssText - stylesheet source text.
 * @returns {string|null} lower-cased charset name, or null when no valid
 *   leading `@charset` rule is present (or cssText is not a string).
 */
function getCharsetFromCss (cssText) {
	// Anchored with ^ so that only the leading rule is considered. Without the
	// anchor, text that merely starts with "@charset" but has a malformed first
	// rule could pick up a later, well-formed "@charset" elsewhere in the file.
	const CHARSET_REGEXP = /^@charset\s(?:"(.*?)"|'(.*?)')[\s;]/;

	// Missing or non-text input has no charset to report.
	if (typeof cssText !== 'string') {
		return null;
	}

	const charsetMatch = CHARSET_REGEXP.exec(cssText);
	// Group 1 is the double-quoted name, group 2 the single-quoted one.
	const charset = charsetMatch?.[1] || charsetMatch?.[2];
	return charset?.toLowerCase() ?? null;
}

/**
 * Switches a resource to a new text encoding.
 *
 * When the resource already has non-empty text, the text is turned back into
 * bytes using the resource's current encoding and those bytes are decoded
 * again with the new encoding. The new encoding is then stored on the
 * resource regardless of whether any text was present.
 *
 * @param {Object} resource - object exposing getText/setText/getEncoding/setEncoding.
 * @param {string} encoding - encoding name to decode with and to store.
 * @returns {void}
 */
function updateResourceEncoding (resource, encoding) {
	logger.debug(`updating encoding of resource ${resource} to ${encoding}`);

	const currentText = resource.getText();
	if (currentText) {
		// Recover the original bytes first, then decode them with the new encoding.
		const rawBytes = Buffer.from(currentText, resource.getEncoding());
		const reDecodedText = rawBytes.toString(encoding);
		resource.setText(reDecodedText);
	}

	resource.setEncoding(encoding);
}

export {
isUrl,
getUrl,
Expand All @@ -181,5 +207,7 @@ export {
extend,
union,
isPlainObject,
series
series,
getCharsetFromCss,
updateResourceEncoding
};
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import scrape from 'website-scraper';
const testDirname = './test/functional/encoding/.tmp';
const mockDirname = './test/functional/encoding/mocks';

describe('Functional: UTF8 characters are properly encoded/decoded', () => {
describe('Functional: encoding', () => {
const options = {
urls: [
'http://example.com/',
Expand All @@ -26,17 +26,29 @@ describe('Functional: UTF8 characters are properly encoded/decoded', () => {
await fs.rm(testDirname, { recursive: true, force: true });
});

beforeEach(() => {
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html; charset=utf-8'});
it('should save the page with enconding from http response headers', async () => {
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/without-charset.html', {'content-type': 'text/html; charset=utf-8'});

await scrape(options);

const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' });
scrapedIndex.should.be.containEql('<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Слава Україні!</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-chinese">加入网站</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Обладнання та ПЗ</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-french">PAR PASSION DU VÉLO</div>');
});

it('should save the page in the same data as it was originally', async () => {
it('should save the page with enconding from html meta tag', async () => {
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/with-charset.html', {'content-type': 'text/html'});

await scrape(options);

const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' });
scrapedIndex.should.be.containEql('<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Слава Україні!</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-chinese">加入网站</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Обладнання та ПЗ</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-french">PAR PASSION DU VÉLO</div>');
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@
<div id="special-characters-ukrainian">Слава Україні!</div>
<div id="special-characters-chinese">加入网站</div>
<div id="special-characters-ukrainian">Обладнання та ПЗ</div>
<div id="special-characters-french">PAR PASSION DU VÉLO</div>
</body>
</html>
13 changes: 13 additions & 0 deletions test/functional/encoding/mocks/without-charset.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>Test</title>
</head>
<body>
<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>
<div id="special-characters-ukrainian">Слава Україні!</div>
<div id="special-characters-chinese">加入网站</div>
<div id="special-characters-ukrainian">Обладнання та ПЗ</div>
<div id="special-characters-french">PAR PASSION DU VÉLO</div>
</body>
</html>
Empty file added test/unit/plugins.test.js
Empty file.
14 changes: 14 additions & 0 deletions test/unit/resource-handler/css.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,25 @@ describe('ResourceHandler: Css', () => {
const downloadChildrenPaths = sinon.stub().resolves('updated text');

const originalResource = new Resource('http://example.com');
originalResource.setText('original css text');
const cssHandler = new CssResourceHandler({}, {downloadChildrenPaths});

return cssHandler.handle(originalResource).then((updatedResource) => {
should(updatedResource).be.equal(originalResource);
should(updatedResource.getText()).be.eql('updated text');
});
});

it('should update resource encoding if charset found', () => {
const downloadChildrenPaths = sinon.stub().resolves('updated text');

const originalResource = new Resource('http://example.com');
originalResource.setText('@charset "UTF-8";');
const cssHandler = new CssResourceHandler({}, {downloadChildrenPaths});

return cssHandler.handle(originalResource).then((updatedResource) => {
should(updatedResource).be.equal(originalResource);
should(updatedResource.getEncoding()).be.eql('utf8');
});
});
});
22 changes: 22 additions & 0 deletions test/unit/resource-handler/html.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,28 @@ describe('ResourceHandler: Html', () => {
});
});

describe('<meta> tag', () => {
beforeEach(() => {
htmlHandler = new HtmlHandler({ sources: [] }, {downloadChildrenPaths});
});

it('should change encoding of resouce if html contains <meta> with charset attr', async () => {
const html = `
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body></body>
</html>
`;
const resource = new Resource('http://example.com', 'index.html');
resource.setText(html);

await htmlHandler.handle(resource);
should(resource.getEncoding()).eql('utf8');
});
});

it('should not encode text to html entities', () => {
htmlHandler = new HtmlHandler({ sources: [] }, {downloadChildrenPaths});
const html = `
Expand Down
26 changes: 25 additions & 1 deletion test/unit/utils/utils-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import {
getFilepathFromUrl, getHashFromUrl, getRelativePath,
shortenFilename, prettifyFilename,
isUriSchemaSupported, urlsEqual,
normalizeUrl
normalizeUrl, getCharsetFromCss
} from '../../../lib/utils/index.js';

describe('Utils', function () {
Expand Down Expand Up @@ -231,4 +231,28 @@ describe('Utils', function () {
should(normalizeUrl(malformedUrl)).be.eql(malformedUrl);
});
});

describe('#getCharsetFromCss', () => {
it('should return charset from the beginning of css (inside double quotes)', () => {
const cssText = '@charset "UTF-8"; ';
should(getCharsetFromCss(cssText)).be.eql('utf-8');
});

it('should return charset from the beginning of css (inside single quotes)', () => {
const cssText = `@charset 'UTF-8'; `;
should(getCharsetFromCss(cssText)).be.eql('utf-8');
});

it('should return null if no charset', () => {
const cssText = `h1 {color: red};`;
should(getCharsetFromCss(cssText)).be.eql(null);
});

it('should return null if charset is not valid', () => {
should(getCharsetFromCss('@charset "UTF-8"; ')).be.eql(null);
should(getCharsetFromCss(' @charset "UTF-8"; ')).be.eql(null);
should(getCharsetFromCss('@charset UTF-8;')).be.eql(null);
should(getCharsetFromCss('h1 {color: red}; @charset "UTF-8";')).be.eql(null);
});
});
});