From 5a16c4cb8d283903e09cb6d6dff65ecc11879add Mon Sep 17 00:00:00 2001 From: Luca Spiller Date: Mon, 11 Jul 2016 15:52:31 +0200 Subject: [PATCH] Strip attachments from emails when converting --- lib/heathen/processor_methods/rfc822totext.rb | 12 +++++++ lib/heathen/task.rb | 6 ++++ spec/fixtures/heathen/email.eml | 34 +++++++++++++++++++ .../processor_methods/rfc822totext_spec.rb | 23 +++++++++++++ 4 files changed, 75 insertions(+) create mode 100644 lib/heathen/processor_methods/rfc822totext.rb create mode 100644 spec/fixtures/heathen/email.eml create mode 100644 spec/heathen/processor_methods/rfc822totext_spec.rb diff --git a/lib/heathen/processor_methods/rfc822totext.rb b/lib/heathen/processor_methods/rfc822totext.rb new file mode 100644 index 0000000..af35c05 --- /dev/null +++ b/lib/heathen/processor_methods/rfc822totext.rb @@ -0,0 +1,12 @@ +require 'mail' + +module Heathen + class Processor + def rfc822totext + expect_mime_type 'message/rfc822' + + mail = Mail.read(job.content_file).without_attachments! + job.content = mail.to_s + end + end +end diff --git a/lib/heathen/task.rb b/lib/heathen/task.rb index 872d013..da47a05 100644 --- a/lib/heathen/task.rb +++ b/lib/heathen/task.rb @@ -71,6 +71,10 @@ def task_key action, mime_type perform_task 'ocr' when %r[text/html] wkhtmltopdf '-d 100 --encoding UTF-8' + when %r[message/rfc822] + rfc822totext + job.reset_content_file! + libreoffice format: 'pdf' else libreoffice format: 'pdf' end @@ -92,6 +96,8 @@ def task_key action, mime_type htmltotext when %r[application/pdf] pdftotext + when %r[message/rfc822] + rfc822totext else libreoffice format: 'txt' end diff --git a/spec/fixtures/heathen/email.eml b/spec/fixtures/heathen/email.eml new file mode 100644 index 0000000..6531f03 --- /dev/null +++ b/spec/fixtures/heathen/email.eml @@ -0,0 +1,34 @@ + +MIME-Version: 1.0 +Received: by 10.79.89.194 with HTTP; Mon, 11 Jul 2016 06:41:56 -0700 (PDT) +Date: Mon, 11 Jul 2016 16:41:56 +0300 +Delivered-To: employee@example.com +Message-ID: +Subject: Important Reports +From: Boss +To: Employee +Content-Type: multipart/mixed; boundary=001a114a86e6d9bf9605375c50db + +--001a114a86e6d9bf9605375c50db +Content-Type: multipart/alternative; boundary=001a114a86e6d9bf7a05375c50d9 + +--001a114a86e6d9bf7a05375c50d9 +Content-Type: text/plain; charset=UTF-8 + +Attached are the important reports. + +--001a114a86e6d9bf7a05375c50d9 +Content-Type: text/html; charset=UTF-8 + +
Attached are the important reports. +
+ +--001a114a86e6d9bf7a05375c50d9-- +--001a114a86e6d9bf9605375c50db +Content-Type: text/csv; charset=US-ASCII; name="reports.csv" +Content-Disposition: attachment; filename="reports.csv" +Content-Transfer-Encoding: base64 +X-Attachment-Id: f_iqi2vbfj0 + +b25lLHR3byx0aHJlZQoxLDIsMwo= +--001a114a86e6d9bf9605375c50db-- diff --git a/spec/heathen/processor_methods/rfc822totext_spec.rb b/spec/heathen/processor_methods/rfc822totext_spec.rb new file mode 100644 index 0000000..918634a --- /dev/null +++ b/spec/heathen/processor_methods/rfc822totext_spec.rb @@ -0,0 +1,23 @@ +require 'spec_helper' + +describe Heathen::Processor do + let(:content) { File.read(fixture('heathen/email.eml')) } + let(:job) { Heathen::Job.new 'foo', content, 'en' } + let(:processor) { described_class.new job: job, logger: Logger.new($stderr) } + + before do + allow(content).to receive(:mime_type).and_return('message/rfc822') + end + + after do + processor.clean_up + end + + context '#rfc822totext' do + it 'strips attachments' do + processor.rfc822totext + expect(job.content.mime_type).to eq 'message/rfc822; charset=us-ascii' + expect(job.content).to_not include('reports.csv') + end + end +end