From 62eda0a7f11f7094384d3d2f6c4dc1e7dfcba253 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Fri, 26 Aug 2022 23:53:37 +0200 Subject: [PATCH 1/6] Use enconding from the resource text --- .eslintrc.yml | 2 +- lib/resource-handler/css/index.js | 23 +++++++++++++++++----- lib/resource-handler/html/index.js | 22 +++++++++++++++++---- lib/resource.js | 14 +++++++++++-- lib/utils/index.js | 16 ++++++++++++++- test/unit/resource-handler/css.test.js | 14 +++++++++++++ test/unit/resource-handler/html.test.js | 22 +++++++++++++++++++++ test/unit/utils/utils-test.js | 26 ++++++++++++++++++++++++- 8 files changed, 125 insertions(+), 14 deletions(-) diff --git a/.eslintrc.yml b/.eslintrc.yml index 2eabecba..241b9d56 100644 --- a/.eslintrc.yml +++ b/.eslintrc.yml @@ -1,6 +1,6 @@ extends: "eslint:recommended" parserOptions: - ecmaVersion: 8 + ecmaVersion: 2020 sourceType: "module" env: node: true diff --git a/lib/resource-handler/css/index.js b/lib/resource-handler/css/index.js index e76b60c0..ced2ac72 100644 --- a/lib/resource-handler/css/index.js +++ b/lib/resource-handler/css/index.js @@ -1,4 +1,6 @@ import CssText from './../path-containers/css-text.js'; +import logger from '../../logger.js'; +import { getCharsetFromCss } from '../../utils/index.js'; class CssResourceHandler { constructor (options, methods) { @@ -7,12 +9,23 @@ class CssResourceHandler { this.updateMissingSources = this.options.updateMissingSources === true || Array.isArray(this.options.updateMissingSources); } - handle (resource) { + async handle (resource) { + prepareToLoad(resource); + const pathContainer = new CssText(resource.getText()); - return this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources).then(function updateText (updatedText) { - resource.setText(updatedText); - return resource; - }); + + const updatedText = await this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources); + resource.setText(updatedText); + return resource; + } +} + +function prepareToLoad (resource) { + const charset = getCharsetFromCss(resource.getText()); + if (charset && charset === 'utf-8') { // TODO: support more charsets here? + const enconding = 'utf8'; + logger.debug(`@charset="${charset}" found in ${resource}, changing encoding to ${enconding}`); + resource.setEncoding(enconding); } } diff --git a/lib/resource-handler/html/index.js b/lib/resource-handler/html/index.js index 4c1d6a7a..315bd9aa 100644 --- a/lib/resource-handler/html/index.js +++ b/lib/resource-handler/html/index.js @@ -23,9 +23,8 @@ class HtmlResourceHandler { } async handle (resource) { + prepareToLoad(resource); const $ = loadTextToCheerio(resource.getText()); - prepareToLoad($, resource); - const sourceRulesLoadPromises = this.allSources.map( rule => this.loadResourcesForRule.bind(this, $, resource, rule) ); @@ -68,16 +67,31 @@ class HtmlResourceHandler { } } -function prepareToLoad ($, resource) { - $('base').each((i, element) => { +function prepareToLoad (resource) { + const $ = loadTextToCheerio(resource.getText()); + + $('base[href]').each((i, element) => { const el = $(element); const href = el.attr('href'); if (href) { const newUrl = getUrl(resource.getUrl(), href); + logger.debug(` tag found in resource ${resource}, changing url to ${newUrl}`); resource.setUrl(newUrl); + el.remove(); + resource.setText($.html()); } }); + + $('meta[charset]').each((i, element) => { + const el = $(element); + const charset = el.attr('charset')?.toLowerCase(); + if (charset && charset === 'utf-8') { // utf-8 is the only valid value for html5 documents + const enconding = 'utf8'; + logger.debug(` found in ${resource}, changing encoding to ${enconding}`); + resource.setEncoding(enconding); + } + }); } function loadTextToCheerio (text) { diff --git a/lib/resource.js b/lib/resource.js index ae78886c..6f152506 100644 --- a/lib/resource.js +++ b/lib/resource.js @@ -70,8 +70,18 @@ class Resource { return this.type; } - setEncoding (encoding) { - this.encoding = encoding; + setEncoding (newEncoding) { + if (this.encoding === newEncoding) { + return; + } + + // this is needed for cases when enconding changes for a resource with existing text + // e.g. http response headers has one encoding but resource content has another encoding so we need to change the text + if (this.text) { + this.text = Buffer.from(this.text, this.encoding).toString(newEncoding); + } + + this.encoding = newEncoding; } getEncoding () { diff --git a/lib/utils/index.js b/lib/utils/index.js index 97c4b7be..4e380ffe 100644 --- a/lib/utils/index.js +++ b/lib/utils/index.js @@ -161,6 +161,19 @@ async function series (promises) { return results; } +function getCharsetFromCss (cssText) { + const CHARSET_REGEXP = /(?:@charset\s)(("(.*?)")|('(.*?)'))[\s;]/; + const hasCharset = cssText.startsWith('@charset'); + + if (hasCharset) { + const charsetMatch = CHARSET_REGEXP.exec(cssText); + const charset = charsetMatch?.[3] || charsetMatch?.[5]; + return charset?.toLowerCase() ?? null; + } else { + return null; + } +} + export { isUrl, getUrl, @@ -181,5 +194,6 @@ export { extend, union, isPlainObject, - series + series, + getCharsetFromCss }; diff --git a/test/unit/resource-handler/css.test.js b/test/unit/resource-handler/css.test.js index 13b1eec3..62dcaaeb 100644 --- a/test/unit/resource-handler/css.test.js +++ b/test/unit/resource-handler/css.test.js @@ -8,6 +8,7 @@ describe('ResourceHandler: Css', () => { const downloadChildrenPaths = sinon.stub().resolves('updated text'); const originalResource = new Resource('http://example.com'); + originalResource.setText('original css text'); const cssHandler = new CssResourceHandler({}, {downloadChildrenPaths}); return cssHandler.handle(originalResource).then((updatedResource) => { @@ -15,4 +16,17 @@ describe('ResourceHandler: Css', () => { should(updatedResource.getText()).be.eql('updated text'); }); }); + + it('should update resource encoding if charset found', () => { + const downloadChildrenPaths = sinon.stub().resolves('updated text'); + + const originalResource = new Resource('http://example.com'); + originalResource.setText('@charset "UTF-8";'); + const cssHandler = new CssResourceHandler({}, {downloadChildrenPaths}); + + return cssHandler.handle(originalResource).then((updatedResource) => { + should(updatedResource).be.equal(originalResource); + should(updatedResource.getEncoding()).be.eql('utf-8'); + }); + }); }); diff --git a/test/unit/resource-handler/html.test.js b/test/unit/resource-handler/html.test.js index eefe6a35..0882c4e0 100644 --- a/test/unit/resource-handler/html.test.js +++ b/test/unit/resource-handler/html.test.js @@ -125,6 +125,28 @@ describe('ResourceHandler: Html', () => { }); }); + describe(' tag', () => { + beforeEach(() => { + htmlHandler = new HtmlHandler({ sources: [] }, {downloadChildrenPaths}); + }); + + it('should change encoding of resouce if html contains with charset attr', async () => { + const html = ` + + + + + + + `; + const resource = new Resource('http://example.com', 'index.html'); + resource.setText(html); + + await htmlHandler.handle(resource); + should(resource.getEncoding()).eql('utf8'); + }); + }); + it('should not encode text to html entities', () => { htmlHandler = new HtmlHandler({ sources: [] }, {downloadChildrenPaths}); const html = ` diff --git a/test/unit/utils/utils-test.js b/test/unit/utils/utils-test.js index 80aab4a0..710bb3bd 100644 --- a/test/unit/utils/utils-test.js +++ b/test/unit/utils/utils-test.js @@ -5,7 +5,7 @@ import { getFilepathFromUrl, getHashFromUrl, getRelativePath, shortenFilename, prettifyFilename, isUriSchemaSupported, urlsEqual, - normalizeUrl + normalizeUrl, getCharsetFromCss } from '../../../lib/utils/index.js'; describe('Utils', function () { @@ -231,4 +231,28 @@ describe('Utils', function () { should(normalizeUrl(malformedUrl)).be.eql(malformedUrl); }); }); + + describe('#getCharsetFromCss', () => { + it('should return charset from the beginning of css (inside double quotes)', () => { + const cssText = '@charset "UTF-8"; '; + should(getCharsetFromCss(cssText)).be.eql('utf-8'); + }); + + it('should return charset from the beginning of css (inside single quotes)', () => { + const cssText = `@charset 'UTF-8'; `; + should(getCharsetFromCss(cssText)).be.eql('utf-8'); + }); + + it('should return null if no charset', () => { + const cssText = `h1 {color: red};`; + should(getCharsetFromCss(cssText)).be.eql(null); + }); + + it('should return null if charset is not valid', () => { + should(getCharsetFromCss('@charset "UTF-8"; ')).be.eql(null); + should(getCharsetFromCss(' @charset "UTF-8"; ')).be.eql(null); + should(getCharsetFromCss('@charset UTF-8;')).be.eql(null); + should(getCharsetFromCss('h1 {color: red}; @charset "UTF-8";')).be.eql(null); + }); + }); }); From a677d00c452dfcd90ca65e012404f7c4da1897a7 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Sat, 27 Aug 2022 00:07:18 +0200 Subject: [PATCH 2/6] Update encoding tests --- .../{hieroglyphs.test.js => encoding.test.js} | 20 +++++++++++++++---- .../mocks/{index.html => with-charset.html} | 1 + .../encoding/mocks/without-charset.html | 13 ++++++++++++ test/unit/plugins.test.js | 0 4 files changed, 30 insertions(+), 4 deletions(-) rename test/functional/encoding/{hieroglyphs.test.js => encoding.test.js} (50%) rename test/functional/encoding/mocks/{index.html => with-charset.html} (86%) create mode 100644 test/functional/encoding/mocks/without-charset.html create mode 100644 test/unit/plugins.test.js diff --git a/test/functional/encoding/hieroglyphs.test.js b/test/functional/encoding/encoding.test.js similarity index 50% rename from test/functional/encoding/hieroglyphs.test.js rename to test/functional/encoding/encoding.test.js index 57f5004d..b8c3db5f 100644 --- a/test/functional/encoding/hieroglyphs.test.js +++ b/test/functional/encoding/encoding.test.js @@ -6,7 +6,7 @@ import scrape from 'website-scraper'; const testDirname = './test/functional/encoding/.tmp'; const mockDirname = './test/functional/encoding/mocks'; -describe('Functional: UTF8 characters are properly encoded/decoded', () => { +describe('Functional: encoding', () => { const options = { urls: [ 'http://example.com/', @@ -26,11 +26,22 @@ describe('Functional: UTF8 characters are properly encoded/decoded', () => { await fs.rm(testDirname, { recursive: true, force: true }); }); - beforeEach(() => { - nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html; charset=utf-8'}); + it('should save the page with enconding from http response headers', async () => { + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/without-charset.html', {'content-type': 'text/html; charset=utf-8'}); + + await scrape(options); + + const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' }); + scrapedIndex.should.be.containEql('
저는 7년 동안 한국에서 살았어요.
'); + scrapedIndex.should.be.containEql('
Слава Україні!
'); + scrapedIndex.should.be.containEql('
加入网站
'); + scrapedIndex.should.be.containEql('
Обладнання та ПЗ
'); + scrapedIndex.should.be.containEql('
PAR PASSION DU VÉLO
'); }); - it('should save the page in the same data as it was originally', async () => { + it('should save the page with enconding from html meta tag', async () => { + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/with-charset.html', {'content-type': 'text/html'}); + await scrape(options); const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' }); @@ -38,5 +49,6 @@ describe('Functional: UTF8 characters are properly encoded/decoded', () => { scrapedIndex.should.be.containEql('
Слава Україні!
'); scrapedIndex.should.be.containEql('
加入网站
'); scrapedIndex.should.be.containEql('
Обладнання та ПЗ
'); + scrapedIndex.should.be.containEql('
PAR PASSION DU VÉLO
'); }); }); diff --git a/test/functional/encoding/mocks/index.html b/test/functional/encoding/mocks/with-charset.html similarity index 86% rename from test/functional/encoding/mocks/index.html rename to test/functional/encoding/mocks/with-charset.html index 8874cc53..112d6674 100644 --- a/test/functional/encoding/mocks/index.html +++ b/test/functional/encoding/mocks/with-charset.html @@ -9,5 +9,6 @@
Слава Україні!
加入网站
Обладнання та ПЗ
+
PAR PASSION DU VÉLO
diff --git a/test/functional/encoding/mocks/without-charset.html b/test/functional/encoding/mocks/without-charset.html new file mode 100644 index 00000000..3a0344c8 --- /dev/null +++ b/test/functional/encoding/mocks/without-charset.html @@ -0,0 +1,13 @@ + + + + Test + + +
저는 7년 동안 한국에서 살았어요.
+
Слава Україні!
+
加入网站
+
Обладнання та ПЗ
+
PAR PASSION DU VÉLO
+ + diff --git a/test/unit/plugins.test.js b/test/unit/plugins.test.js new file mode 100644 index 00000000..e69de29b From c294caec1732becef6471fc26f3861d3a478f4be Mon Sep 17 00:00:00 2001 From: Sophia Antipenko Date: Sun, 28 Aug 2022 21:19:41 +0200 Subject: [PATCH 3/6] Update css.test.js --- test/unit/resource-handler/css.test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/resource-handler/css.test.js b/test/unit/resource-handler/css.test.js index 62dcaaeb..5dc4dbdd 100644 --- a/test/unit/resource-handler/css.test.js +++ b/test/unit/resource-handler/css.test.js @@ -26,7 +26,7 @@ describe('ResourceHandler: Css', () => { return cssHandler.handle(originalResource).then((updatedResource) => { should(updatedResource).be.equal(originalResource); - should(updatedResource.getEncoding()).be.eql('utf-8'); + should(updatedResource.getEncoding()).be.eql('utf8'); }); }); }); From ac129b1d80ea8442378b105cc26dbcef71977354 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Mon, 29 Aug 2022 21:36:58 +0200 Subject: [PATCH 4/6] Mv update encoding logic outside of resource --- lib/resource-handler/css/index.js | 6 ++---- lib/resource-handler/html/index.js | 5 ++--- lib/resource.js | 16 +++------------- lib/scraper.js | 3 ++- lib/utils/index.js | 16 +++++++++++++++- 5 files changed, 24 insertions(+), 22 deletions(-) diff --git a/lib/resource-handler/css/index.js b/lib/resource-handler/css/index.js index ced2ac72..ffb5ac8a 100644 --- a/lib/resource-handler/css/index.js +++ b/lib/resource-handler/css/index.js @@ -1,6 +1,6 @@ import CssText from './../path-containers/css-text.js'; import logger from '../../logger.js'; -import { getCharsetFromCss } from '../../utils/index.js'; +import { getCharsetFromCss, updateResourceEncoding } from '../../utils/index.js'; class CssResourceHandler { constructor (options, methods) { @@ -23,9 +23,7 @@ class CssResourceHandler { function prepareToLoad (resource) { const charset = getCharsetFromCss(resource.getText()); if (charset && charset === 'utf-8') { // TODO: support more charsets here? - const enconding = 'utf8'; - logger.debug(`@charset="${charset}" found in ${resource}, changing encoding to ${enconding}`); - resource.setEncoding(enconding); + updateResourceEncoding(resource, 'utf8'); } } diff --git a/lib/resource-handler/html/index.js b/lib/resource-handler/html/index.js index 315bd9aa..62ea1320 100644 --- a/lib/resource-handler/html/index.js +++ b/lib/resource-handler/html/index.js @@ -1,5 +1,5 @@ import cheerio from 'cheerio'; -import { union, getUrl, series } from '../../utils/index.js'; +import { union, getUrl, series, updateResourceEncoding } from '../../utils/index.js'; import logger from '../../logger.js'; import HtmlSourceElement from './html-source-element.js'; @@ -88,8 +88,7 @@ function prepareToLoad (resource) { const charset = el.attr('charset')?.toLowerCase(); if (charset && charset === 'utf-8') { // utf-8 is the only valid value for html5 documents const enconding = 'utf8'; - logger.debug(` found in ${resource}, changing encoding to ${enconding}`); - resource.setEncoding(enconding); + updateResourceEncoding(resource, 'utf8'); } }); } diff --git a/lib/resource.js b/lib/resource.js index 6f152506..c57ebf1d 100644 --- a/lib/resource.js +++ b/lib/resource.js @@ -70,18 +70,8 @@ class Resource { return this.type; } - setEncoding (newEncoding) { - if (this.encoding === newEncoding) { - return; - } - - // this is needed for cases when enconding changes for a resource with existing text - // e.g. http response headers has one encoding but resource content has another encoding so we need to change the text - if (this.text) { - this.text = Buffer.from(this.text, this.encoding).toString(newEncoding); - } - - this.encoding = newEncoding; + setEncoding (encoding) { + this.encoding = encoding; } getEncoding () { @@ -97,7 +87,7 @@ class Resource { } toString () { - return '{ url: "' + this.getUrl() + '", filename: "' + this.getFilename() + '", depth: ' + this.getDepth() + ' }'; + return `{ url: "${this.getUrl()}", filename: "${this.getFilename()}", depth: ${this.getDepth()}, type: "${this.getType()}" }`; } isSaved () { diff --git a/lib/scraper.js b/lib/scraper.js index e14a3d4d..ff2d810d 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -170,7 +170,6 @@ class Scraper { self.requestedResourcePromises.set(responseData.url, requestPromise); } - resource.setEncoding(responseData.encoding); resource.setType(getTypeByMime(responseData.mimeType)); const { filename } = await self.runActions('generateFilename', { resource, responseData }); @@ -185,7 +184,9 @@ class Scraper { resource.setMetadata(responseData.metadata); } + resource.setEncoding(responseData.encoding); resource.setText(responseData.body); + self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad return resource; }).catch(function handleError (err) { diff --git a/lib/utils/index.js b/lib/utils/index.js index 4e380ffe..27f42779 100644 --- a/lib/utils/index.js +++ b/lib/utils/index.js @@ -174,6 +174,19 @@ function getCharsetFromCss (cssText) { } } +function updateResourceEncoding (resource, encoding) { + logger.debug(`updating encoding of resource ${resource} to ${encoding}`); + + const resourceText = resource.getText(); + + if (resourceText) { + const updatedText = Buffer.from(resourceText, resource.getEncoding()).toString(encoding); + resource.setText(updatedText); + } + + resource.setEncoding(encoding); +} + export { isUrl, getUrl, @@ -195,5 +208,6 @@ export { union, isPlainObject, series, - getCharsetFromCss + getCharsetFromCss, + updateResourceEncoding }; From 8bac08e73841b7b00d4b7694a9910bda15b8039d Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Mon, 29 Aug 2022 21:39:35 +0200 Subject: [PATCH 5/6] Cleanup --- lib/resource-handler/css/index.js | 1 - lib/resource-handler/html/index.js | 1 - 2 files changed, 2 deletions(-) diff --git a/lib/resource-handler/css/index.js b/lib/resource-handler/css/index.js index ffb5ac8a..a6cca474 100644 --- a/lib/resource-handler/css/index.js +++ b/lib/resource-handler/css/index.js @@ -1,5 +1,4 @@ import CssText from './../path-containers/css-text.js'; -import logger from '../../logger.js'; import { getCharsetFromCss, updateResourceEncoding } from '../../utils/index.js'; class CssResourceHandler { diff --git a/lib/resource-handler/html/index.js b/lib/resource-handler/html/index.js index 62ea1320..3e56b9f5 100644 --- a/lib/resource-handler/html/index.js +++ b/lib/resource-handler/html/index.js @@ -87,7 +87,6 @@ function prepareToLoad (resource) { const el = $(element); const charset = el.attr('charset')?.toLowerCase(); if (charset && charset === 'utf-8') { // utf-8 is the only valid value for html5 documents - const enconding = 'utf8'; updateResourceEncoding(resource, 'utf8'); } }); From 57ae6e91d6aa79bf71a974088f9d4986a13a2974 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Mon, 29 Aug 2022 21:47:28 +0200 Subject: [PATCH 6/6] Update comment --- lib/resource-handler/css/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/resource-handler/css/index.js b/lib/resource-handler/css/index.js index a6cca474..9196bafa 100644 --- a/lib/resource-handler/css/index.js +++ b/lib/resource-handler/css/index.js @@ -21,7 +21,7 @@ class CssResourceHandler { function prepareToLoad (resource) { const charset = getCharsetFromCss(resource.getText()); - if (charset && charset === 'utf-8') { // TODO: support more charsets here? + if (charset && charset === 'utf-8') { // do we need to support more charsets here? updateResourceEncoding(resource, 'utf8'); } }