From b2b78e1bd404273406b2087af809198f74fc7e95 Mon Sep 17 00:00:00 2001 From: Micha Date: Thu, 27 Jan 2022 10:10:52 +0100 Subject: [PATCH] raw table data output --- README.md | 5 +++++ lib/pdf2table.js | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ef26307..0f8edef 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,11 @@ fs.readFile('./test.pdf', function (err, buffer) { }); }); +``` +### Getting raw table data +X-axis data is stripped in the default mode, but it may be necessary to reconstruct the table extracted from the PDF. To retrieve it, pass `true` as the `raw` argument, which is `false` by default. +``` +pdf2table.parse(buffer, callback, raw = false) ``` ## Note diff --git a/lib/pdf2table.js b/lib/pdf2table.js index 91afc28..ab8711f 100644 --- a/lib/pdf2table.js +++ b/lib/pdf2table.js @@ -25,15 +25,15 @@ SOFTWARE. var path = require('path'); var PDFParser = require("pdf2json/pdfparser"); - -function parse (pdfBuffer, callback) { +// will return raw cell data including x-coordinates if raw = true +function parse (pdfBuffer, callback, raw = false) { var pdfParser = new PDFParser(); // adding try/catch/printstack 'cause pdfParser seems to prevent errors from bubbing up (weird implementation). // It also doesn't seem to implement the callback(err, otherdata) convention used in most Node.js modules, so let's fix that here. 
pdfParser.on("pdfParser_dataReady", function (data) { try{ - pdfParserCallback(null, data); + pdfParserCallback(null, data, raw); }catch(err){ console.log(err.stack); } @@ -41,14 +41,14 @@ function parse (pdfBuffer, callback) { pdfParser.on("pdfParser_dataError", function (err) { try{ - pdfParserCallback(err, null); + pdfParserCallback(err, null, raw); }catch(err){ console.log(err.stack); } }); - function pdfParserCallback (err, data) { + function pdfParserCallback (err, data, raw) { if(err) return callback(err); @@ -176,7 +176,7 @@ function parse (pdfBuffer, callback) { var rowEntries = [] var row = myPages[p][r].data; for (var i = 0; i < row.length; i++) { - rowEntries.push(row[i].text) + rowEntries.push(raw ? row[i] : row[i].text) } // now append the extracted and ordered text into the return rows. rows.push(rowEntries); };