website-scraper · phawxby · Jun 18, 2022 · Jun 18, 2022 · Jun 18, 2022 · Jun 18, 2022
diff --git a/README.md b/README.md
@@ -57,6 +57,8 @@ scrape(options).then((result) => {});
 * [urlFilter](#urlfilter) - skip some urls
 * [filenameGenerator](#filenamegenerator) - generate filename for downloaded resource
 * [requestConcurrency](#requestconcurrency) - set maximum concurrent requests
+ * [tempMode](#tempmode) - how to store data temporarily during processing
+ * [tempDir](#tempmode) - directory used to store temporary files when `tempMode` is `filesystem`
 * [plugins](#plugins) - plugins, allow to customize filenames, request options, response handling, saving to storage, etc.
 
 Default options you can find in [lib/config/defaults.js](https://github.com/website-scraper/node-website-scraper/blob/master/lib/config/defaults.js) or get them using 
@@ -83,15 +85,44 @@ How to download website to existing directory and why it's not supported by defa
 
 #### sources
 Array of objects to download, specifies selectors and attribute values to select files for downloading. By default scraper tries to download all possible resources. Scraper uses cheerio to select html elements so `selector` can be any [selector that cheerio supports](https://github.com/cheeriojs/cheerio#selectors).
+
+You can also specify a custom `containerClass` for a source. A container class is responsible for reading paths from an attribute value and writing the updated paths back to it. For example, to read a JSON array of paths from an attribute:
+
 ```javascript
+class JsonContainerClass {
+  constructor (text) {
+    this.text = text || '';
+    this.paths = [];
+
+    if (this.text) {
+      this.paths = JSON.parse(this.text);
+    }
+  }
+
+  getPaths () {
+    return this.paths;
+  }
+
+  updateText (pathsToUpdate) {
+    this.paths = this.paths.map((oldPath) => {
+      const toUpdate = pathsToUpdate.find((x) => x.oldPath === oldPath);
+
+      return toUpdate ? toUpdate.newPath : oldPath;
+    });
+
+    return JSON.stringify(this.paths);
+  }
+}
+
 // Downloading images, css files and scripts
 scrape({
   urls: ['http://nodejs.org/'],
   directory: '/path/to/save',
   sources: [
-    {selector: 'img', attr: 'src'},
-    {selector: 'link[rel="stylesheet"]', attr: 'href'},
-    {selector: 'script', attr: 'src'}
+    { selector: 'img', attr: 'src' },
+    { selector: 'link[rel="stylesheet"]', attr: 'href' },
+    { selector: 'script', attr: 'src' },
+    { selector: 'div', attr: 'data-json', containerClass: JsonContainerClass }
   ]
 });
 ```
@@ -199,6 +230,13 @@ scrape({
 #### requestConcurrency
 Number, maximum amount of concurrent requests. Defaults to `Infinity`.
 
+#### tempMode
+
+String, how to store temporary data during processing. Defaults to `memory`.
+
+* `memory` - Data is stored in memory in its raw format (default).
+* `memory-compressed` - Data is stored in memory but compressed using zlib. This is more memory efficient at the expense of CPU time spent compressing and decompressing.
+* `filesystem` - Data is stored in temporary files on the filesystem, inside `tempDir` (if `tempDir` is not set, a directory is created under the OS temp directory). This is the most memory efficient mode, but it is strongly recommended to only use it with a solid state drive.
 
 #### plugins
 
@@ -331,7 +369,6 @@ Promise should be resolved with:
   * `body` (response body, string)
   * `encoding` (`binary` or `utf8`) used to save the file, binary used by default.
   * `metadata` (object) - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result.
-* a binary `string`. This is advised against because of the binary assumption being made can foul up saving of `utf8` responses to the filesystem. 
 
 If multiple actions `afterResponse` added - scraper will use result from last one.
 ```javascript
@@ -430,7 +467,7 @@ If multiple actions `saveResource` added - resource will be saved to multiple st
 ```javascript
 registerAction('saveResource', async ({resource}) => {
   const filename = resource.getFilename();
-  const text = resource.getText();
+  const text = await resource.getText();
   await saveItSomewhere(filename, text);
 });
 ```

diff --git a/lib/config/defaults.js b/lib/config/defaults.js
@@ -63,7 +63,9 @@ const config = {
 	recursive: false,
 	maxRecursiveDepth: null,
 	maxDepth: null,
-	ignoreErrors: false
+	ignoreErrors: false,
+	tempMode: 'memory', // or 'memory-compressed', 'filesystem'
+	tempDir: undefined
 };
 
 export default config;
diff --git a/lib/plugins/save-resource-to-fs-plugin.js b/lib/plugins/save-resource-to-fs-plugin.js
@@ -1,32 +1,36 @@
 import path from 'path';
-import fs from 'fs-extra';
+import { promises as fs } from 'fs';
+import { exists } from '../utils/index.js';
 
 class SaveResourceToFileSystemPlugin {
 	apply (registerAction) {
 		let absoluteDirectoryPath, loadedResources = [];
 
-		registerAction('beforeStart', ({options}) => {
+		registerAction('beforeStart', async ({options}) => {
 			if (!options.directory || typeof options.directory !== 'string') {
 				throw new Error(`Incorrect directory ${options.directory}`);
 			}
 
 			absoluteDirectoryPath = path.resolve(process.cwd(), options.directory);
 
-			if (fs.existsSync(absoluteDirectoryPath)) {
+			if (await exists(absoluteDirectoryPath)) {
 				throw new Error(`Directory ${absoluteDirectoryPath} exists`);
 			}
 		});
 
 		registerAction('saveResource', async ({resource}) => {
 			const filename = path.join(absoluteDirectoryPath, resource.getFilename());
-			const text = resource.getText();
-			await fs.outputFile(filename, text, { encoding: resource.getEncoding() });
+			await fs.mkdir(path.dirname(filename), { recursive: true });
+
+			const text = await resource.getText();
+
+			await fs.writeFile(filename, text, { encoding: resource.getEncoding() });
 			loadedResources.push(resource);
 		});
 
 		registerAction('error', async () => {
 			if (loadedResources.length > 0) {
-				await fs.remove(absoluteDirectoryPath);
+				await fs.rm(absoluteDirectoryPath, { recursive: true, force: true });
 			}
 		});
 	}

diff --git a/lib/request.js b/lib/request.js
@@ -41,6 +41,10 @@ function throwTypeError (result) {
 }
 
 function getData (result) {
+	if (typeof result === 'string') {
+		throw new Error('afterResponse handler returned a string, expected object');
+	}
+
 	let data = result;
 	if (result && typeof result === 'object' && 'body' in result) {
 		data = result.body;

diff --git a/lib/resource-handler/css/index.js b/lib/resource-handler/css/index.js
@@ -7,12 +7,13 @@ class CssResourceHandler {
 		this.updateMissingSources = this.options.updateMissingSources === true || Array.isArray(this.options.updateMissingSources);
 	}
 
-	handle (resource) {
-		const pathContainer = new CssText(resource.getText());
-		return this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources).then(function updateText (updatedText) {
-			resource.setText(updatedText);
-			return resource;
-		});
+	async handle (resource) {
+		const pathContainer = new CssText(await resource.getText());
+
+		const updatedText = await this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources);
+		await resource.setText(updatedText);
+
+		return resource;
 	}
 }
 

diff --git a/lib/resource-handler/html/html-source-element.js b/lib/resource-handler/html/html-source-element.js
@@ -51,7 +51,7 @@ class HtmlSourceElement {
 	 */
 	getPathContainer () {
 		const selectedRule = this.findMatchedRule(pathContainersByRule);
-		const ContainerClass = selectedRule ? selectedRule.containerClass : CommonTag;
+		const ContainerClass = this.rule.containerClass || (selectedRule ? selectedRule.containerClass : CommonTag);
 		const textWithResources = this.getData();
 		return textWithResources ? new ContainerClass(textWithResources) : null;
 	}

diff --git a/lib/resource-handler/html/index.js b/lib/resource-handler/html/index.js
@@ -23,15 +23,15 @@ class HtmlResourceHandler {
 	}
 
 	async handle (resource) {
-		const $ = loadTextToCheerio(resource.getText());
+		const $ = loadTextToCheerio(await resource.getText());
 		prepareToLoad($, resource);
 
 		const sourceRulesLoadPromises = this.allSources.map(
 			rule => this.loadResourcesForRule.bind(this, $, resource, rule)
 		);
 		await series(sourceRulesLoadPromises);
 
-		resource.setText($.html());
+		await resource.setText($.html());
 		return resource;
 	}
 

diff --git a/lib/resource.js b/lib/resource.js
@@ -1,9 +1,20 @@
 import types from './config/resource-types.js';
+import crypto from 'crypto';
+import fs from 'fs/promises';
+import path from 'path';
+import { compress, decompress } from './utils/index.js';
 
 class Resource {
-	constructor (url, filename) {
-		this.url = url;
-		this.filename = filename;
+	constructor (url, filename, tempMode, tempDir) {
+		this.tempMode = tempMode || 'memory';
+		this.tempDir = tempDir;
+
+		if (this.tempMode === 'filesystem' && !this.tempDir) {
+			throw new Error('tmpDir must be provided in tmpMode=filesystem');
+		}
+
+		this.setUrl(url);
+		this.setFilename(filename);
 
 		this.type = null;
 		this.depth = 0;
@@ -16,7 +27,7 @@ class Resource {
 	}
 
 	createChild (url, filename) {
-		const child = new Resource(url, filename);
+		const child = new Resource(url, filename, this.tempMode, this.tempDir);
 		let currentDepth = this.getDepth();
 
 		child.parent = this;
@@ -39,6 +50,12 @@ class Resource {
 	}
 
 	setUrl (url) {
+		if (this.tempDir) {
+			// Generate a unique filename based on the md5 hash of the url
+			const tmpName = `${crypto.createHash('md5').update(url).digest('hex')}.txt`;
+			this.tempPath = path.join(this.tempDir, tmpName);
+		}
+
 		this.url = url;
 	}
 
@@ -50,12 +67,34 @@ class Resource {
 		this.filename = filename;
 	}
 
-	getText () {
-		return this.text;
+	async getText () {
+		switch (this.tempMode) {
+			case 'memory':
+				return await this.text;
+			case 'memory-compressed':
+				return (await decompress(this.text)).toString(this.getEncoding());
+			case 'filesystem':
+				return await fs.readFile(this.tempPath, { encoding: this.getEncoding() });
+			default:
+				throw new Error(`Unknown tempMode: ${this.tempMode}`);
+		}
 	}
 
-	setText (text) {
-		this.text = text;
+	async setText (text) {
+		switch (this.tempMode) {
+			case 'memory':
+				this.text = text;
+				break;
+			case 'memory-compressed':
+				this.text = await compress(text);
+				break;
+			case 'filesystem':
+				await fs.mkdir(this.tempDir, { recursive: true });
+				await fs.writeFile(this.tempPath, text, { encoding: this.getEncoding() });
+				break;
+			default:
+				throw new Error(`Unknown tempMode: ${this.tempMode}`);
+		}
 	}
 
 	getDepth () {

diff --git a/lib/scraper.js b/lib/scraper.js
@@ -11,6 +11,9 @@ import {
 	GenerateFilenameByTypePlugin,
 	GetResourceReferencePlugin
 } from './plugins/index.js';
+import fs from 'fs';
+import path from 'path';
+import os from 'os';
 
 import * as utils from './utils/index.js';
 const { extend, union, urlsEqual, getTypeByMime, getTypeByFilename, series } = utils;
@@ -47,7 +50,16 @@ class Scraper {
 			requestResource: this.requestResource.bind(this),
 			getReference: this.runActions.bind(this, 'getReference')
 		});
-		this.resources = this.options.urls.map(({url, filename}) => new Resource(url, filename));
+
+		logger.info('tmpMode', this.options.tempMode);
+		if (this.options.tempMode === 'filesystem') {
+			if (!this.options.tempDir) {
+				this.options.tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'website-scraper-'));
+			}
+			logger.info('tmpDir', this.options.tempDir);
+		}
+
+		this.resources = this.options.urls.map(({url, filename}) => new Resource(url, filename, this.options.tempMode, this.options.tempDir));
 
 		this.requestedResourcePromises = new NormalizedUrlMap(); // Map url -> request promise
 		this.loadedResources = new NormalizedUrlMap(); // Map url -> resource
@@ -185,7 +197,8 @@ class Scraper {
 					resource.setMetadata(responseData.metadata);
 				}
 
-				resource.setText(responseData.body);
+				await resource.setText(responseData.body);
+
 				self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
 				return resource;
 			}).catch(function handleError (err) {
@@ -280,6 +293,10 @@ class Scraper {
 			throw error;
 		} finally {
 			await this.runActions('afterFinish');
+
+			if (this.options.tempDir) {
+				await fs.promises.rm(this.options.tempDir, { recursive: true, force: true });
+			}
 		}
 	}
 }