diff --git a/.eslintrc.yml b/.eslintrc.yml index 2eabecba..241b9d56 100644 --- a/.eslintrc.yml +++ b/.eslintrc.yml @@ -1,6 +1,6 @@ extends: "eslint:recommended" parserOptions: - ecmaVersion: 8 + ecmaVersion: 2020 sourceType: "module" env: node: true diff --git a/lib/resource-handler/css/index.js b/lib/resource-handler/css/index.js index e76b60c0..9196bafa 100644 --- a/lib/resource-handler/css/index.js +++ b/lib/resource-handler/css/index.js @@ -1,4 +1,5 @@ import CssText from './../path-containers/css-text.js'; +import { getCharsetFromCss, updateResourceEncoding } from '../../utils/index.js'; class CssResourceHandler { constructor (options, methods) { @@ -7,12 +8,21 @@ class CssResourceHandler { this.updateMissingSources = this.options.updateMissingSources === true || Array.isArray(this.options.updateMissingSources); } - handle (resource) { + async handle (resource) { + prepareToLoad(resource); + const pathContainer = new CssText(resource.getText()); - return this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources).then(function updateText (updatedText) { - resource.setText(updatedText); - return resource; - }); + + const updatedText = await this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources); + resource.setText(updatedText); + return resource; + } +} + +function prepareToLoad (resource) { + const charset = getCharsetFromCss(resource.getText()); + if (charset && charset === 'utf-8') { // do we need to support more charsets here? 
+ updateResourceEncoding(resource, 'utf8'); } } diff --git a/lib/resource-handler/html/index.js b/lib/resource-handler/html/index.js index 4c1d6a7a..3e56b9f5 100644 --- a/lib/resource-handler/html/index.js +++ b/lib/resource-handler/html/index.js @@ -1,5 +1,5 @@ import cheerio from 'cheerio'; -import { union, getUrl, series } from '../../utils/index.js'; +import { union, getUrl, series, updateResourceEncoding } from '../../utils/index.js'; import logger from '../../logger.js'; import HtmlSourceElement from './html-source-element.js'; @@ -23,9 +23,8 @@ class HtmlResourceHandler { } async handle (resource) { + prepareToLoad(resource); const $ = loadTextToCheerio(resource.getText()); - prepareToLoad($, resource); - const sourceRulesLoadPromises = this.allSources.map( rule => this.loadResourcesForRule.bind(this, $, resource, rule) ); @@ -68,16 +67,29 @@ class HtmlResourceHandler { } } -function prepareToLoad ($, resource) { - $('base').each((i, element) => { +function prepareToLoad (resource) { + const $ = loadTextToCheerio(resource.getText()); + + $('base[href]').each((i, element) => { const el = $(element); const href = el.attr('href'); if (href) { const newUrl = getUrl(resource.getUrl(), href); + logger.debug(`<base> tag found in resource ${resource}, changing url to ${newUrl}`); resource.setUrl(newUrl); + el.remove(); + resource.setText($.html()); } }); + + $('meta[charset]').each((i, element) => { + const el = $(element); + const charset = el.attr('charset')?.toLowerCase(); + if (charset && charset === 'utf-8') { // utf-8 is the only valid value for html5 documents + updateResourceEncoding(resource, 'utf8'); + } + }); } function loadTextToCheerio (text) { diff --git a/lib/resource.js b/lib/resource.js index ae78886c..c57ebf1d 100644 --- a/lib/resource.js +++ b/lib/resource.js @@ -87,7 +87,7 @@ class Resource { } toString () { - return '{ url: "' + this.getUrl() + '", filename: "' + this.getFilename() + '", depth: ' + this.getDepth() + ' }'; + return `{ url: 
"${this.getUrl()}", filename: "${this.getFilename()}", depth: ${this.getDepth()}, type: "${this.getType()}" }`; } isSaved () { diff --git a/lib/scraper.js b/lib/scraper.js index e14a3d4d..ff2d810d 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -170,7 +170,6 @@ class Scraper { self.requestedResourcePromises.set(responseData.url, requestPromise); } - resource.setEncoding(responseData.encoding); resource.setType(getTypeByMime(responseData.mimeType)); const { filename } = await self.runActions('generateFilename', { resource, responseData }); @@ -185,7 +184,9 @@ class Scraper { resource.setMetadata(responseData.metadata); } + resource.setEncoding(responseData.encoding); resource.setText(responseData.body); + self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad return resource; }).catch(function handleError (err) { diff --git a/lib/utils/index.js b/lib/utils/index.js index 97c4b7be..27f42779 100644 --- a/lib/utils/index.js +++ b/lib/utils/index.js @@ -161,6 +161,32 @@ async function series (promises) { return results; } +function getCharsetFromCss (cssText) { + const CHARSET_REGEXP = /(?:@charset\s)(("(.*?)")|('(.*?)'))[\s;]/; + const hasCharset = cssText.startsWith('@charset'); + + if (hasCharset) { + const charsetMatch = CHARSET_REGEXP.exec(cssText); + const charset = charsetMatch?.[3] || charsetMatch?.[5]; + return charset?.toLowerCase() ?? 
null; + } else { + return null; + } +} + +function updateResourceEncoding (resource, encoding) { + logger.debug(`updating encoding of resource ${resource} to ${encoding}`); + + const resourceText = resource.getText(); + + if (resourceText) { + const updatedText = Buffer.from(resourceText, resource.getEncoding()).toString(encoding); + resource.setText(updatedText); + } + + resource.setEncoding(encoding); +} + export { isUrl, getUrl, @@ -181,5 +207,7 @@ export { extend, union, isPlainObject, - series + series, + getCharsetFromCss, + updateResourceEncoding }; diff --git a/test/functional/encoding/hieroglyphs.test.js b/test/functional/encoding/encoding.test.js similarity index 50% rename from test/functional/encoding/hieroglyphs.test.js rename to test/functional/encoding/encoding.test.js index 57f5004d..b8c3db5f 100644 --- a/test/functional/encoding/hieroglyphs.test.js +++ b/test/functional/encoding/encoding.test.js @@ -6,7 +6,7 @@ import scrape from 'website-scraper'; const testDirname = './test/functional/encoding/.tmp'; const mockDirname = './test/functional/encoding/mocks'; -describe('Functional: UTF8 characters are properly encoded/decoded', () => { +describe('Functional: encoding', () => { const options = { urls: [ 'http://example.com/', @@ -26,11 +26,22 @@ describe('Functional: UTF8 characters are properly encoded/decoded', () => { await fs.rm(testDirname, { recursive: true, force: true }); }); - beforeEach(() => { - nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html; charset=utf-8'}); + it('should save the page with encoding from http response headers', async () => { + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/without-charset.html', {'content-type': 'text/html; charset=utf-8'}); + + await scrape(options); + + const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' }); + scrapedIndex.should.be.containEql('
저는 7년 동안 한국에서 살았어요.
'); + scrapedIndex.should.be.containEql('
Слава Україні!
'); + scrapedIndex.should.be.containEql('
加入网站
'); + scrapedIndex.should.be.containEql('
Обладнання та ПЗ
'); + scrapedIndex.should.be.containEql('
PAR PASSION DU VÉLO
'); }); - it('should save the page in the same data as it was originally', async () => { + it('should save the page with encoding from html meta tag', async () => { + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/with-charset.html', {'content-type': 'text/html'}); + await scrape(options); const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' }); @@ -38,5 +49,6 @@ describe('Functional: UTF8 characters are properly encoded/decoded', () => { scrapedIndex.should.be.containEql('
Слава Україні!
'); scrapedIndex.should.be.containEql('
加入网站
'); scrapedIndex.should.be.containEql('
Обладнання та ПЗ
'); + scrapedIndex.should.be.containEql('
PAR PASSION DU VÉLO
'); }); }); diff --git a/test/functional/encoding/mocks/index.html b/test/functional/encoding/mocks/with-charset.html similarity index 86% rename from test/functional/encoding/mocks/index.html rename to test/functional/encoding/mocks/with-charset.html index 8874cc53..112d6674 100644 --- a/test/functional/encoding/mocks/index.html +++ b/test/functional/encoding/mocks/with-charset.html @@ -9,5 +9,6 @@
Слава Україні!
加入网站
Обладнання та ПЗ
+
PAR PASSION DU VÉLO
diff --git a/test/functional/encoding/mocks/without-charset.html b/test/functional/encoding/mocks/without-charset.html new file mode 100644 index 00000000..3a0344c8 --- /dev/null +++ b/test/functional/encoding/mocks/without-charset.html @@ -0,0 +1,13 @@ + + + + Test + + +
저는 7년 동안 한국에서 살았어요.
+
Слава Україні!
+
加入网站
+
Обладнання та ПЗ
+
PAR PASSION DU VÉLO
+ + diff --git a/test/unit/plugins.test.js b/test/unit/plugins.test.js new file mode 100644 index 00000000..e69de29b diff --git a/test/unit/resource-handler/css.test.js b/test/unit/resource-handler/css.test.js index 13b1eec3..5dc4dbdd 100644 --- a/test/unit/resource-handler/css.test.js +++ b/test/unit/resource-handler/css.test.js @@ -8,6 +8,7 @@ describe('ResourceHandler: Css', () => { const downloadChildrenPaths = sinon.stub().resolves('updated text'); const originalResource = new Resource('http://example.com'); + originalResource.setText('original css text'); const cssHandler = new CssResourceHandler({}, {downloadChildrenPaths}); return cssHandler.handle(originalResource).then((updatedResource) => { @@ -15,4 +16,17 @@ describe('ResourceHandler: Css', () => { should(updatedResource.getText()).be.eql('updated text'); }); }); + + it('should update resource encoding if charset found', () => { + const downloadChildrenPaths = sinon.stub().resolves('updated text'); + + const originalResource = new Resource('http://example.com'); + originalResource.setText('@charset "UTF-8";'); + const cssHandler = new CssResourceHandler({}, {downloadChildrenPaths}); + + return cssHandler.handle(originalResource).then((updatedResource) => { + should(updatedResource).be.equal(originalResource); + should(updatedResource.getEncoding()).be.eql('utf8'); + }); + }); }); diff --git a/test/unit/resource-handler/html.test.js b/test/unit/resource-handler/html.test.js index eefe6a35..0882c4e0 100644 --- a/test/unit/resource-handler/html.test.js +++ b/test/unit/resource-handler/html.test.js @@ -125,6 +125,28 @@ describe('ResourceHandler: Html', () => { }); }); + describe('<meta> tag', () => { + beforeEach(() => { + htmlHandler = new HtmlHandler({ sources: [] }, {downloadChildrenPaths}); + }); + + it('should change encoding of resource if html contains <meta> with charset attr', async () => { + const html = ` + + + + + + + `; + const resource = new Resource('http://example.com', 'index.html'); + 
resource.setText(html); + + await htmlHandler.handle(resource); + should(resource.getEncoding()).eql('utf8'); + }); + }); + it('should not encode text to html entities', () => { htmlHandler = new HtmlHandler({ sources: [] }, {downloadChildrenPaths}); const html = ` diff --git a/test/unit/utils/utils-test.js b/test/unit/utils/utils-test.js index 80aab4a0..710bb3bd 100644 --- a/test/unit/utils/utils-test.js +++ b/test/unit/utils/utils-test.js @@ -5,7 +5,7 @@ import { getFilepathFromUrl, getHashFromUrl, getRelativePath, shortenFilename, prettifyFilename, isUriSchemaSupported, urlsEqual, - normalizeUrl + normalizeUrl, getCharsetFromCss } from '../../../lib/utils/index.js'; describe('Utils', function () { @@ -231,4 +231,28 @@ describe('Utils', function () { should(normalizeUrl(malformedUrl)).be.eql(malformedUrl); }); }); + + describe('#getCharsetFromCss', () => { + it('should return charset from the beginning of css (inside double quotes)', () => { + const cssText = '@charset "UTF-8"; '; + should(getCharsetFromCss(cssText)).be.eql('utf-8'); + }); + + it('should return charset from the beginning of css (inside single quotes)', () => { + const cssText = `@charset 'UTF-8'; `; + should(getCharsetFromCss(cssText)).be.eql('utf-8'); + }); + + it('should return null if no charset', () => { + const cssText = `h1 {color: red};`; + should(getCharsetFromCss(cssText)).be.eql(null); + }); + + it('should return null if charset is not valid', () => { + should(getCharsetFromCss('@charset  "UTF-8"; ')).be.eql(null); + should(getCharsetFromCss(' @charset "UTF-8"; ')).be.eql(null); + should(getCharsetFromCss('@charset UTF-8;')).be.eql(null); + should(getCharsetFromCss('h1 {color: red}; @charset "UTF-8";')).be.eql(null); + }); + }); });