diff --git a/.eslintrc.yml b/.eslintrc.yml
index 2eabecba..241b9d56 100644
--- a/.eslintrc.yml
+++ b/.eslintrc.yml
@@ -1,6 +1,6 @@
extends: "eslint:recommended"
parserOptions:
- ecmaVersion: 8
+ ecmaVersion: 2020
sourceType: "module"
env:
node: true
diff --git a/lib/resource-handler/css/index.js b/lib/resource-handler/css/index.js
index e76b60c0..9196bafa 100644
--- a/lib/resource-handler/css/index.js
+++ b/lib/resource-handler/css/index.js
@@ -1,4 +1,5 @@
import CssText from './../path-containers/css-text.js';
+import { getCharsetFromCss, updateResourceEncoding } from '../../utils/index.js';
class CssResourceHandler {
constructor (options, methods) {
@@ -7,12 +8,21 @@ class CssResourceHandler {
this.updateMissingSources = this.options.updateMissingSources === true || Array.isArray(this.options.updateMissingSources);
}
- handle (resource) {
+ async handle (resource) {
+ prepareToLoad(resource);
+
const pathContainer = new CssText(resource.getText());
- return this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources).then(function updateText (updatedText) {
- resource.setText(updatedText);
- return resource;
- });
+
+ const updatedText = await this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources);
+ resource.setText(updatedText);
+ return resource;
+ }
+}
+
/**
 * Aligns the resource's encoding with the charset declared by a leading `@charset` rule.
 *
 * Only UTF-8 is handled: when the stylesheet declares it (either as `utf-8` or the
 * equivalent label `utf8`), the resource text is converted and its encoding set to `utf8`.
 * Any other declared charset, or no declaration at all, leaves the resource untouched.
 *
 * @param {object} resource - resource exposing `getText()`; it is passed on to `updateResourceEncoding`
 * @returns {void}
 */
function prepareToLoad (resource) {
	// getCharsetFromCss returns a lower-cased name or null, so strict comparison is enough
	const charset = getCharsetFromCss(resource.getText());
	if (charset === 'utf-8' || charset === 'utf8') {
		updateResourceEncoding(resource, 'utf8');
	}
}
diff --git a/lib/resource-handler/html/index.js b/lib/resource-handler/html/index.js
index 4c1d6a7a..3e56b9f5 100644
--- a/lib/resource-handler/html/index.js
+++ b/lib/resource-handler/html/index.js
@@ -1,5 +1,5 @@
import cheerio from 'cheerio';
-import { union, getUrl, series } from '../../utils/index.js';
+import { union, getUrl, series, updateResourceEncoding } from '../../utils/index.js';
import logger from '../../logger.js';
import HtmlSourceElement from './html-source-element.js';
@@ -23,9 +23,8 @@ class HtmlResourceHandler {
}
async handle (resource) {
+ prepareToLoad(resource);
const $ = loadTextToCheerio(resource.getText());
- prepareToLoad($, resource);
-
const sourceRulesLoadPromises = this.allSources.map(
rule => this.loadResourcesForRule.bind(this, $, resource, rule)
);
@@ -68,16 +67,29 @@ class HtmlResourceHandler {
}
}
-function prepareToLoad ($, resource) {
- $('base').each((i, element) => {
/**
 * Normalises an html resource before its children are collected.
 *
 * 1. Every `<base>` tag with a non-empty `href` is applied to the resource url
 *    (resolved through `getUrl` against the current url) and removed from the document;
 *    the resource text is then replaced with the re-serialised document.
 * 2. If any `<meta charset>` declares utf-8 (case-insensitive), the resource is
 *    converted to `utf8` via `updateResourceEncoding`.
 *
 * @param {object} resource - resource exposing `getText`/`setText` and `getUrl`/`setUrl`
 * @returns {void}
 */
function prepareToLoad (resource) {
	const $ = loadTextToCheerio(resource.getText());

	let baseRemoved = false;
	$('base[href]').each((i, element) => {
		const el = $(element);
		const href = el.attr('href');
		if (!href) {
			// attribute is present but empty: nothing to resolve against, keep the tag
			return;
		}

		const newUrl = getUrl(resource.getUrl(), href);
		logger.debug(`<base> tag found in resource ${resource}, changing url to ${newUrl}`);
		resource.setUrl(newUrl);

		el.remove();
		baseRemoved = true;
	});

	// Serialise once after all removals, and only when the document actually changed
	if (baseRemoved) {
		resource.setText($.html());
	}

	let declaresUtf8 = false;
	$('meta[charset]').each((i, element) => {
		// utf-8 is the only valid value for html5 documents
		if ($(element).attr('charset')?.toLowerCase() === 'utf-8') {
			declaresUtf8 = true;
		}
	});

	// Convert a single time, no matter how many meta tags repeat the declaration
	if (declaresUtf8) {
		updateResourceEncoding(resource, 'utf8');
	}
}
function loadTextToCheerio (text) {
diff --git a/lib/resource.js b/lib/resource.js
index ae78886c..c57ebf1d 100644
--- a/lib/resource.js
+++ b/lib/resource.js
@@ -87,7 +87,7 @@ class Resource {
}
toString () {
- return '{ url: "' + this.getUrl() + '", filename: "' + this.getFilename() + '", depth: ' + this.getDepth() + ' }';
+ return `{ url: "${this.getUrl()}", filename: "${this.getFilename()}", depth: ${this.getDepth()}, type: "${this.getType()}" }`;
}
isSaved () {
diff --git a/lib/scraper.js b/lib/scraper.js
index e14a3d4d..ff2d810d 100644
--- a/lib/scraper.js
+++ b/lib/scraper.js
@@ -170,7 +170,6 @@ class Scraper {
self.requestedResourcePromises.set(responseData.url, requestPromise);
}
- resource.setEncoding(responseData.encoding);
resource.setType(getTypeByMime(responseData.mimeType));
const { filename } = await self.runActions('generateFilename', { resource, responseData });
@@ -185,7 +184,9 @@ class Scraper {
resource.setMetadata(responseData.metadata);
}
+ resource.setEncoding(responseData.encoding);
resource.setText(responseData.body);
+
self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
return resource;
}).catch(function handleError (err) {
diff --git a/lib/utils/index.js b/lib/utils/index.js
index 97c4b7be..27f42779 100644
--- a/lib/utils/index.js
+++ b/lib/utils/index.js
@@ -161,6 +161,32 @@ async function series (promises) {
return results;
}
/**
 * Reads the charset named by an `@charset` rule located at the very start of a stylesheet.
 *
 * The rule must be the first thing in the text: `@charset`, one whitespace character,
 * the name in double or single quotes, then whitespace or `;`. A rule appearing
 * anywhere else is ignored.
 *
 * @param {string} cssText - stylesheet source
 * @returns {string|null} lower-cased charset name, or null when the text is not a string,
 *   has no leading `@charset` rule, or the declared name is empty
 */
function getCharsetFromCss (cssText) {
	if (typeof cssText !== 'string') {
		return null;
	}

	// Anchored, so a malformed leading rule cannot make a later `@charset` match instead.
	// Group 1 holds a double-quoted name, group 2 a single-quoted one.
	const CHARSET_REGEXP = /^@charset\s(?:"(.*?)"|'(.*?)')[\s;]/;
	const charsetMatch = CHARSET_REGEXP.exec(cssText);

	// `||` (not `??`) on purpose: an empty quoted name is treated as "no charset"
	const charset = charsetMatch?.[1] || charsetMatch?.[2];
	return charset?.toLowerCase() ?? null;
}
+
/**
 * Switches a resource to another encoding, converting its text along the way.
 *
 * The current text is turned back into bytes using the resource's current encoding and
 * decoded again with the requested one; afterwards the new encoding is stored on the
 * resource. When the resource already has the requested encoding nothing is done, which
 * avoids a pointless round-trip that could drop characters the encoding cannot represent.
 *
 * @param {object} resource - resource exposing `getText`/`setText` and `getEncoding`/`setEncoding`
 * @param {string} encoding - target encoding name understood by Node's `Buffer` (e.g. `utf8`)
 * @returns {void}
 */
function updateResourceEncoding (resource, encoding) {
	const currentEncoding = resource.getEncoding();
	if (currentEncoding === encoding) {
		return;
	}

	logger.debug(`updating encoding of resource ${resource} to ${encoding}`);

	const resourceText = resource.getText();

	// Empty or missing text has nothing to convert; only the encoding is recorded
	if (resourceText) {
		const updatedText = Buffer.from(resourceText, currentEncoding).toString(encoding);
		resource.setText(updatedText);
	}

	resource.setEncoding(encoding);
}
+
export {
isUrl,
getUrl,
@@ -181,5 +207,7 @@ export {
extend,
union,
isPlainObject,
- series
+ series,
+ getCharsetFromCss,
+ updateResourceEncoding
};
diff --git a/test/functional/encoding/hieroglyphs.test.js b/test/functional/encoding/encoding.test.js
similarity index 50%
rename from test/functional/encoding/hieroglyphs.test.js
rename to test/functional/encoding/encoding.test.js
index 57f5004d..b8c3db5f 100644
--- a/test/functional/encoding/hieroglyphs.test.js
+++ b/test/functional/encoding/encoding.test.js
@@ -6,7 +6,7 @@ import scrape from 'website-scraper';
const testDirname = './test/functional/encoding/.tmp';
const mockDirname = './test/functional/encoding/mocks';
-describe('Functional: UTF8 characters are properly encoded/decoded', () => {
+describe('Functional: encoding', () => {
const options = {
urls: [
'http://example.com/',
@@ -26,11 +26,22 @@ describe('Functional: UTF8 characters are properly encoded/decoded', () => {
await fs.rm(testDirname, { recursive: true, force: true });
});
- beforeEach(() => {
- nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html; charset=utf-8'});
+ it('should save the page with enconding from http response headers', async () => {
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/without-charset.html', {'content-type': 'text/html; charset=utf-8'});
+
+ await scrape(options);
+
+ const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' });
+ scrapedIndex.should.be.containEql('
저는 7년 동안 한국에서 살았어요.
');
+ scrapedIndex.should.be.containEql('Слава Україні!
');
+ scrapedIndex.should.be.containEql('加入网站
');
+ scrapedIndex.should.be.containEql('Обладнання та ПЗ
');
+ scrapedIndex.should.be.containEql('PAR PASSION DU VÉLO
');
});
- it('should save the page in the same data as it was originally', async () => {
+ it('should save the page with enconding from html meta tag', async () => {
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/with-charset.html', {'content-type': 'text/html'});
+
await scrape(options);
const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' });
@@ -38,5 +49,6 @@ describe('Functional: UTF8 characters are properly encoded/decoded', () => {
scrapedIndex.should.be.containEql('Слава Україні!
');
scrapedIndex.should.be.containEql('加入网站
');
scrapedIndex.should.be.containEql('Обладнання та ПЗ
');
+ scrapedIndex.should.be.containEql('PAR PASSION DU VÉLO
');
});
});
diff --git a/test/functional/encoding/mocks/index.html b/test/functional/encoding/mocks/with-charset.html
similarity index 86%
rename from test/functional/encoding/mocks/index.html
rename to test/functional/encoding/mocks/with-charset.html
index 8874cc53..112d6674 100644
--- a/test/functional/encoding/mocks/index.html
+++ b/test/functional/encoding/mocks/with-charset.html
@@ -9,5 +9,6 @@
Слава Україні!
加入网站
Обладнання та ПЗ
+ PAR PASSION DU VÉLO