diff --git a/README.md b/README.md index d3ad33f..c1ce047 100755 --- a/README.md +++ b/README.md @@ -1,138 +1,56 @@ # Permaloom -Node.js package that archives webpages to Arweave. +Heritrix wrapper that archives webpages to Arweave. ## Installation -Using [npm](https://www.npmjs.com/): +Install [Heritrix](https://github.com/internetarchive/heritrix3). -```bash -npm install permaloom -``` - -## Usage - -For a better understanding, read the Yukikaki [documentation](https://github.com/Moogamouth/Yukikaki#readme). - -You can import Permaloom using `require`: -```js -(async () => { - const permaloom = new require("permaloom")("arweave.net", 443, "https"); -})(); -``` - -Or with `import`: -```js -(async () => { - import Permaloom from "permaloom"; - await const permaloom = new Permaloom("arweave.net", 443, "https"); -})(); -``` - -You need to provide values for host, port and protocol. - -### Other class parameters - -#### headless -`Bool` - -Optional. Default is true. If false, starts crawling in headful mode. - -### .archive(options) -Scrapes data from webpages according to `options`, and archives it to arweave. - -#### options.url -`String` - -The URL to start crawling from. - -#### options.func(i, maxFee, res, page) -`Function` - -`.archive()` will run `options.func` on every webpage it crawls and returns `vals`. - -`i` and `maxFee` equate to the current values of `options.i` and `options.maxFee`. - -Note: `i` will be decremented based on how many links or sources away the page is from the starting page. +Install [cURL](https://curl.se/). -`res` -[`<HTTPResponse>`](https://pptr.dev/api/puppeteer.httpresponse) +Download the latest [release](https://github.com/Moogamouth/Permaloom/releases) of Permaloom and place it in the Heritrix directory. -Puppeteer response from the current page. +Place an Arweave key file named `key.json` in the Permaloom directory. -`page` -[`<Page>`](https://pptr.dev/api/puppeteer.page) - -Puppeteer page of the current page. 
+## Usage -`vals.archive` -`Bool` +To run Permaloom, simply run the Permaloom executable file. You will have to provide `metadata.operatorContactUrl` and seed values in the `crawler-beans.cxml` file. -Optional. Default is true. Draft an archive of the current page. Will be uploaded once scrape is finished. +You can edit the `crawler-beans.cxml` and `config.json` files to change the crawl settings. -`vals.ytdl` -`Bool` +Each time Permaloom is executed, Permaloom's `crawler-beans.cxml` file overwrites the one being used by Heritrix. For more information on the `crawler-beans.cxml` file, go to [https://heritrix.readthedocs.io/en/latest/configuring-jobs.html](https://heritrix.readthedocs.io/en/latest/configuring-jobs.html). -Optional. Sets `options.ytdl` for the current page. +`config.json` -`vals.srcs` -`Bool` +Here is the default `config.json` file: +```json +{ + "host": "arweave.net", + "port": 443, + "addr": "https://localhost:8443/", + "username": "admin", + "password": "admin", + "maxFee": 100000000000000000 +} +``` -Optional. Sets `options.srcs` for the current page. +Every value shown above is required in the `config.json` file. -`vals.hrefs` -`Bool` +`host` -Optional. Sets `options.hrefs` for the current page. +`String` -#### options.maxFee -`Int` +The hostname of the Arweave gateway to use. -The maximum fee to pay for the archive, in winston. The archive will cancel if the amount is exceeded. +`port` -#### options.i `Int` -Optional. Default is 1. Determines when to stop archiving trees of links and sources. If `options.i` > 1, options.hrefs will automatically be set to true. - -#### options.srcs -`Bool` - -Optional. Default is true. Scrape sources of the starting page. - -#### options.hrefs -`Bool` - -Optional. If true, scrape links, links of links, so on, stemming from the starting page. It will stop when options.i is depleted. Will automatically be set to true if `options.i` > 1. - -#### options.uploadOnGen -`Bool` +The port of the Arweave gateway to use. -Optional. 
If true, transactions will be uploaded one by one, on generation. This means that `options.maxFee` will be applied to each transaction singularly, instead of summing up the fees of all tranactions. Also, upload of transactions will be skipped if the transaction's webpage has already been archived after `options.after`, otherwise skip generation of transactions if the transaction's webpage has already been archived after `options.after`. +`maxFee` -#### options.after `Int` -Optional. Default is 0. Represents a Unix timestamp in milliseconds. If `options.uploadOnGen` is not set to false, upload of transactions will be skipped if the transaction's webpage has already been archived after `options.after`, otherwise skip generation of transactions if the transaction's webpage has already been archived after `options.after`. - -#### options.robots -`Bool` - -Optional. If true, only scrape pages in accordance with robots.txt. - -#### options.robotsNeutral - -`Bool` - -Optional. Default is true. Crawl pages that are neutral according to robots.txt. - -#### options.robotsSrcsHrefs -`Bool` - -Optional. Default is true. Crawl links and sources even if the current page is not compatible with robots.txt. - -#### options.ytdl -`Bool` - -Optional. 
If true, attempt to draft transactions of blob URLs inside of HTML ")[0].split("src=\"")[1].split("\"")[0].search("blob:") === 0) ytdlTx = true; - if (ytdlTx) console.log(await exec(`yt-dlp ${url}`)); - if (ytdlTx) arr.push(this2.arweave.createTransaction({ - data: await exec(`yt-dlp ${url}`), - tags: [{"name":"Content-Type", "value":"text/html"}, {"name":"User-Agent","value":"Permaloom/0.2.4"}, {"name":"page:url","value":videoUrl}, {"name":"page:title","value":await page.title()}, {"name":"page:timestamp","value":`${Date.now()}`}, {"name":"page:cookiesId","value":cookies.id}] - }, key)); - } - - if (!uploadOnGen) { - fee += arr[0].reward + arr[1].reward; - if (fee > maxFee) throw new Error("Fee limit exceeded"); - else return arr; - } - if (uploadOnGen && arr[0].reward + arr[0].reward < maxFee) this2.upload(arr, this2, options.key); - } - } - } - - async upload(data, this2, key) { - for (let i of data) { - const uploader = await this2.arweave.transactions.getUploader(await this2.arweave.transactions.sign(i, key)); - while (!uploader.isComplete) await uploader.uploadChunk(); - } - } - - async archive(options) { - options.this = this; - options.after = options.after ?? 0; - options.userAgent = "Permaloom/0.2.4"; - - options.func2 = options.func; - options.func = async function(options, res, page) { - const vals = await options.func2(options.i, options.maxFee, res, page); - - options.ytdl = vals.ytdl ?? options.ytdl; - - if (vals.archive != false) return await options.this.draftPage(options.uploadOnGen, options.url, options.after, options.this, options.key, options.ytdl, options.fee, options.maxFee, options.data, res, page); - - options.srcs = vals.srcs ?? options.hrefs; - options.hrefs = vals.hrefs ?? options.hrefs; - } - - await this.yukikaki.scrape(options); - - if (!options.uploadOnGen) this.upload(options.data, this, options.key); - } - -}; \ No newline at end of file