From 7d5013010d4ef11ecac8d9f15cbe213de82ddcf1 Mon Sep 17 00:00:00 2001 From: WillNigel23 Date: Sat, 13 Jul 2024 04:48:43 +0800 Subject: [PATCH] 4225 - Migrate Process Document Upload rake to a active job --- .github/workflows/ci.yml | 1 + Gemfile | 4 +- app/controllers/application_controller.rb | 6 +- app/interactors/rake_interactor.rb | 36 +++++++++ app/interactors/work/refresh_metadata.rb | 1 + app/jobs/application_job.rb | 7 ++ app/jobs/document_upload/process_job.rb | 8 ++ app/models/document_upload.rb | 26 +++--- app/views/admin/uploads.html.slim | 4 +- config/application.rb | 2 + lib/tasks/bulk_export.rake | 10 +-- lib/tasks/ingestor.rake | 98 +++++++++++------------ 12 files changed, 126 insertions(+), 77 deletions(-) create mode 100644 app/interactors/rake_interactor.rb create mode 100644 app/jobs/application_job.rb create mode 100644 app/jobs/document_upload/process_job.rb diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 045e5b2c30..73748771a3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,5 +1,6 @@ name: "CI" on: + push: {} pull_request: {} jobs: test: diff --git a/Gemfile b/Gemfile index 9c32a258de..64eb1854a4 100644 --- a/Gemfile +++ b/Gemfile @@ -13,6 +13,7 @@ gem 'jquery-rails' gem 'jquery-ui-sass-rails' gem 'mysql2' gem 'nokogiri' +gem 'postmark-rails' gem 'recaptcha', require: 'recaptcha/rails' gem 'rmagick' gem 'ruby-openai' @@ -20,9 +21,6 @@ gem 'rvm1-capistrano3', require: false gem 'savon', '~> 2.12.0' gem 'text' gem 'thredded', '~> 1.0' -gem "recaptcha", require: "recaptcha/rails" -gem 'ruby-openai' -gem 'postmark-rails' gem 'will_paginate' gem 'acts_as_list' diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index 35a869491e..c02459f2bc 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -50,7 +50,7 @@ def switch_locale(&action) end # append region to locale - related_locales = http_accept_language.user_preferred_languages.select do |loc| + related_locales = http_accept_language.user_preferred_languages.select do |loc| loc.to_s.include?(locale.to_s) && # is related to the chosen locale (is the locale, or is a regional version of it) I18n.available_locales.map{|e| e.to_s}.include?(loc.to_s) # is an available locale end @@ -84,7 +84,7 @@ def guest_user def guest_transcription return head(:forbidden) unless GUEST_TRANSCRIPTION_ENABLED - + if check_recaptcha(model: @page, :attribute => :errors) User.find(session[:guest_user_id].nil? ? session[:guest_user_id] = create_guest_user.id : session[:guest_user_id]) redirect_to :controller => 'transcribe', :action => 'display_page', :page_id => @page.id @@ -198,7 +198,7 @@ def set_friendly_collection(id) elsif !DocumentSet.find_by(slug: id).nil? @collection = DocumentSet.find_by(slug: id) elsif !Collection.find_by(slug: id).nil? - @collection = Collection.find_by(slug: id) + @collection = Collection.find_by(slug: id) end # check to make sure URLs haven't gotten scrambled diff --git a/app/interactors/rake_interactor.rb b/app/interactors/rake_interactor.rb new file mode 100644 index 0000000000..c3e2c4debb --- /dev/null +++ b/app/interactors/rake_interactor.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true +require 'rake' + +class RakeInteractor + include Interactor + + def initialize(task_name: '', args: {}, log_file: nil) + @task_name = task_name + @args = args + @log_file = log_file || "#{Rails.root}/log/rake.log" + + super + end + + def call + old_stdout = $stdout + log_buffer = StringIO.new + $stdout = log_buffer + + begin + Rake::Task[@task_name].reenable + Rake::Task[@task_name].invoke(*@args.values) + rescue => e + puts "#{e.class}: #{e.message}" + puts e.backtrace.join("\n") + ensure + $stdout = old_stdout + + File.new(@log_file, 'w').close unless File.exist?(@log_file) + + File.open(@log_file, 'a') do |f| + f.puts log_buffer.string + end + end + end +end diff --git a/app/interactors/work/refresh_metadata.rb b/app/interactors/work/refresh_metadata.rb index 94578ca90d..82fa514406 100644 --- a/app/interactors/work/refresh_metadata.rb +++ b/app/interactors/work/refresh_metadata.rb @@ -19,6 +19,7 @@ def call finalize rescue => e @errors << "Error: #{e}" + context.errors = @errors context.fail! end diff --git a/app/jobs/application_job.rb b/app/jobs/application_job.rb new file mode 100644 index 0000000000..d394c3d106 --- /dev/null +++ b/app/jobs/application_job.rb @@ -0,0 +1,7 @@ +class ApplicationJob < ActiveJob::Base + # Automatically retry jobs that encountered a deadlock + # retry_on ActiveRecord::Deadlocked + + # Most jobs are safe to ignore if the underlying records are no longer available + # discard_on ActiveJob::DeserializationError +end diff --git a/app/jobs/document_upload/process_job.rb b/app/jobs/document_upload/process_job.rb new file mode 100644 index 0000000000..6be59c15f1 --- /dev/null +++ b/app/jobs/document_upload/process_job.rb @@ -0,0 +1,8 @@ +class DocumentUpload::ProcessJob < ApplicationJob + queue_as :default + + def perform(id, log_file) + args = { id: id } + RakeInteractor.new(task_name: 'fromthepage:process_document_upload', args: args, log_file: log_file).call + end +end diff --git a/app/models/document_upload.rb b/app/models/document_upload.rb index ffaec12c59..9ef70221b6 100644 --- a/app/models/document_upload.rb +++ b/app/models/document_upload.rb @@ -21,28 +21,23 @@ class DocumentUpload < ApplicationRecord belongs_to :user, optional: true belongs_to :collection, optional: true - validates :collection_id, :file, :presence => true + validates :collection_id, :file, presence: true mount_uploader :file, DocumentUploader - module Status - NEW = 'new' - QUEUED = 'queued' - PROCESSING = 'processing' - FINISHED = 'finished' - ERROR = 'error' - end + enum status: { + new: 'new', + queued: 'queued', + processing: 'processing', + finished: 'finished', + error: 'error' + }, _prefix: :status def submit_process - self.status = Status::QUEUED + self.status = :queued self.save - rake_call = "#{RAKE} fromthepage:process_document_upload[#{self.id}] --trace >> #{log_file} 2>&1 &" - - # Nice-up the rake call if settings are present - rake_call = "nice -n #{NICE_RAKE_LEVEL} " << rake_call if NICE_RAKE_ENABLED - logger.info rake_call - system(rake_call) + DocumentUpload::ProcessJob.perform_later(self.id, log_file) end def log_file @@ -54,6 +49,7 @@ def name end private + def upload_dir if self.file && self.file.path File.dirname(self.file.path) diff --git a/app/views/admin/uploads.html.slim b/app/views/admin/uploads.html.slim index d5d7aee137..b6cf788ff3 100644 --- a/app/views/admin/uploads.html.slim +++ b/app/views/admin/uploads.html.slim @@ -34,10 +34,10 @@ table.admin-grid.datagrid.striped span Actions =svg_symbol '#icon-list', class: 'icon' dd - -if document.status == DocumentUpload::Status::NEW + -if document.status == 'new' =link_to(t('.process_upload'), { :action => 'process_upload', :id => document.id }) hr - -if document.status == DocumentUpload::Status::PROCESSING || document.status == DocumentUpload::Status::FINISHED || document.status == DocumentUpload::Status::ERROR + -if document.status_processing? || document.status_finished? || document.status_error? =link_to(t('.show_processing_log'), admin_view_processing_log_path(:id => document.id), target: '_blank') hr =link_to(t('.delete_upload'), admin_delete_upload_path(:id => document.id), class: 'fgred', data: { confirm: t('.are_you_sure') }) diff --git a/config/application.rb b/config/application.rb index 067b2384e0..064366027e 100644 --- a/config/application.rb +++ b/config/application.rb @@ -46,6 +46,8 @@ class Application < Rails::Application end end + # Load rake tasks + Rails.application.load_tasks end diff --git a/lib/tasks/bulk_export.rake b/lib/tasks/bulk_export.rake index 7838372068..164a632074 100644 --- a/lib/tasks/bulk_export.rake +++ b/lib/tasks/bulk_export.rake @@ -16,18 +16,18 @@ namespace :fromthepage do bulk_export_id = args.bulk_export_id print "fetching bulk export with ID=#{bulk_export_id}\n" bulk_export = BulkExport.find bulk_export_id - + print "found bulk_export for \n\tuser=#{bulk_export.user.login}, \n" print "\tfrom collection=#{bulk_export.collection.title}\n" if bulk_export.collection pp bulk_export.attributes - + bulk_export.status = BulkExport::Status::PROCESSING bulk_export.save - -# process_batch(bulk_export, File.dirname(bulk_export.file.path), bulk_export.id.to_s) + + # process_batch(bulk_export, File.dirname(bulk_export.file.path), bulk_export.id.to_s) bulk_export.export_to_zip - bulk_export.status = DocumentUpload::Status::FINISHED + bulk_export.status = :finished bulk_export.save diff --git a/lib/tasks/ingestor.rake b/lib/tasks/ingestor.rake index f0b4867960..38e821e8fd 100644 --- a/lib/tasks/ingestor.rake +++ b/lib/tasks/ingestor.rake @@ -6,7 +6,7 @@ namespace :fromthepage do task :compress_images, [:pathname] => :environment do |t,args| pathname = args.pathname p "compressing #{pathname}" - + if Dir.exist? pathname ImageHelper.compress_files_in_dir(pathname) else @@ -26,15 +26,15 @@ namespace :fromthepage do document_upload_id = args.document_upload_id print "fetching upload with ID=#{document_upload_id}\n" document_upload = DocumentUpload.find document_upload_id - + print "found document_upload for \n\tuser=#{document_upload.user.login}, \n\ttarget collection=#{document_upload.collection.title}, \n\tfile=#{document_upload.file}\n" - - document_upload.status = DocumentUpload::Status::PROCESSING + + document_upload.status = :processing document_upload.save - + process_batch(document_upload, File.dirname(document_upload.file.path), document_upload.id.to_s) - document_upload.status = DocumentUpload::Status::FINISHED + document_upload.status = :finished document_upload.save if SMTP_ENABLED @@ -66,12 +66,12 @@ namespace :fromthepage do # clean clean_tmp_dir(temp_dir) end - + def clean_tmp_dir(temp_dir) print "Removing #{temp_dir}\n" FileUtils::rm_r(temp_dir) end - + def unzip_tree(temp_dir) print "unzip_tree(#{temp_dir})\n" ls = Dir.glob(File.join(temp_dir, "*")).sort @@ -93,7 +93,7 @@ namespace :fromthepage do end FileUtils.chmod_R "u=rwx,go=r", temp_dir end - + def unpdf_tree(temp_dir, ocr) print "unpdf_tree(#{temp_dir})\n" ls = Dir.glob(File.join(temp_dir, "*")).sort @@ -105,7 +105,7 @@ namespace :fromthepage do else if File.extname(path) == '.PDF' || File.extname(path) == '.pdf' print "\t\tunpdf_tree Found pdf #{path}\n" - #extract + #extract destination = ImageHelper.extract_pdf(path, ocr) print "\t\tunpdf_tree Extracted to #{destination}\n" # copy any metadata.yml to the destination @@ -155,10 +155,10 @@ namespace :fromthepage do destination = ImageHelper.compress_image(path) end end - end + end end - - def ingest_tree(document_upload, temp_dir) + + def ingest_tree(document_upload, temp_dir) print "ingest_tree(#{temp_dir})\n" # first process all sub-directories clean_dir=temp_dir.gsub('[','\[').gsub(']','\]') @@ -169,8 +169,8 @@ namespace :fromthepage do print "Found directory #{path}\n" ingest_tree(document_upload, path) #recurse end - end - + end + # now process this directory if it contains image files image_files = Dir.glob(File.join(clean_dir, "*.{"+IMAGE_FILE_EXTENSIONS.join(',')+"}")).sort if image_files.length > 0 @@ -179,7 +179,7 @@ namespace :fromthepage do print "Finished converting files in #{temp_dir} to a work\n" end print "Finished ingest_tree for #{temp_dir}\n" - + end ALLOWLIST = [ @@ -201,14 +201,14 @@ namespace :fromthepage do "slug" ] - + def convert_to_work(document_upload, path) print "convert_to_work creating database record for #{path}\n" print "\tconvert_to_work owner = #{document_upload.user.login}\n" print "\tconvert_to_work collection = #{document_upload.collection.title}\n" print "\tconvert_to_work default title = #{File.basename(path).ljust(3,'.')}\n" print "\tconvert_to_work looking for metadata.yml in #{File.join(File.dirname(path), 'metadata.yml')}\n" - + begin if File.exist? File.join(path, 'metadata.yml') yaml = YAML.load_file(File.join(path, 'metadata.yml')) @@ -219,13 +219,13 @@ namespace :fromthepage do yaml = nil end rescue StandardError => e - document_upload.update(status: DocumentUpload::Status::ERROR) + document_upload.update(status: :error) print "\n\nYML/YAML Failed: Exception: #{e.message}" return end print "\tconvert_to_work loaded metadata.yml values \n#{yaml.to_s}\n" - + User.current_user=document_upload.user document_sets = [] if yaml @@ -252,34 +252,34 @@ namespace :fromthepage do end work.save! - + new_dir_name = File.join(Rails.root, "public", "images", "uploaded", work.id.to_s) print "\tconvert_to_work creating #{new_dir_name}\n" - + FileUtils.mkdir_p(new_dir_name) IMAGE_FILE_EXTENSIONS.each do |ext| # print "\t\tconvert_to_work copying #{File.join(path, "*.#{ext}")} to #{new_dir_name}:\n" clean_dir=path.gsub('[','\[').gsub(']','\]') - FileUtils.cp(Dir.glob(File.join(clean_dir, "*.#{ext}")), new_dir_name) - Dir.glob(File.join(clean_dir, "*.#{ext}")).sort.each { |fn| print "\t\t\tcp #{fn} to #{new_dir_name}\n" } + FileUtils.cp(Dir.glob(File.join(clean_dir, "*.#{ext}")), new_dir_name) + Dir.glob(File.join(clean_dir, "*.#{ext}")).sort.each { |fn| print "\t\t\tcp #{fn} to #{new_dir_name}\n" } # print "\t\tconvert_to_work copied #{File.join(path, "*.#{ext}")} to #{new_dir_name}\n" - end + end # at this point, the new dir should have exactly what we want-- only image files that are adequately compressed. ls = Dir.glob(File.join(new_dir_name, "*")).sort numeric_pages, alpha_numeric_pages = ls.partition { |page| File.basename(page).to_i.positive? } sorted_numeric_pages = numeric_pages.sort_by { |page| File.basename(page).to_i } ls = sorted_numeric_pages.concat(alpha_numeric_pages) - + GC.start ls.each_with_index do |image_fn,i| page = Page.new print "\t\tconvert_to_work created new page\n" - + if document_upload.preserve_titles page.title = File.basename(image_fn, ".*") else @@ -350,15 +350,15 @@ namespace :fromthepage do collection.supports_document_sets = true collection.save! end - + document_sets end - + def temp_dir_path(seed) - File.join(Dir.tmpdir, 'fromthepage_uploads', seed) + File.join(Dir.tmpdir, 'fromthepage_uploads', seed) end - + def copy_to_temp_dir(path, temp_dir) print "creating temp directory #{temp_dir}\n" FileUtils.mkdir_p(temp_dir) @@ -368,74 +368,74 @@ namespace :fromthepage do desc "Import IIIF Collection" task :import_iiif, [:collection_url] => :environment do |t,args| - + ScCollection.delete_all ScManifest.delete_all - ScCanvas.delete_all - + ScCanvas.delete_all + collection_url = args.collection_url p "importing #{collection_url}" collection_string = "" - collection_string = open(collection_url).read + collection_string = open(collection_url).read collection_hash = JSON.parse(collection_string) sc_collection = ScCollection.new sc_collection.context = collection_hash["@context"] sc_collection.save! - + collection_hash["manifests"].each do |manifest_item| sc_manifest = ScManifest.new sc_manifest.sc_collection = sc_collection sc_manifest.sc_id = manifest_item["@id"] sc_manifest.label = manifest_item["label"] - + sc_manifest.save! - + print "Ingesting manifest #{sc_manifest.sc_id}\n" begin manifest_string = open(sc_manifest.sc_id).read manifest_hash = JSON.parse(manifest_string) - + sc_manifest.metadata = manifest_hash["metadata"].to_json if manifest_hash["metadata"] - + first_sequence = manifest_hash["sequences"].first sc_manifest.first_sequence_id = first_sequence["@id"] sc_manifest.first_sequence_label = first_sequence["label"] - + sc_manifest.save! - + first_sequence["canvases"].each do |canvas| sc_canvas = ScCanvas.new sc_canvas.sc_manifest = sc_manifest - + sc_canvas.sc_id = canvas["@id"] sc_canvas.sc_canvas_id = canvas["@id"] sc_canvas.sc_canvas_label = canvas["label"] sc_canvas.sc_canvas_width = canvas["width"] sc_canvas.sc_canvas_height = canvas["height"] - + first_image = canvas["images"].first sc_canvas.sc_image_motivation = first_image["motivation"] sc_canvas.sc_image_on = first_image["on"] - + resource = first_image["resource"] sc_canvas.sc_resource_id = resource["@id"] sc_canvas.sc_resource_type = resource["@type"] sc_canvas.sc_resource_format = resource["format"] - + service = resource["service"] sc_canvas.sc_service_id = service["@id"] sc_canvas.sc_service_context = service["@context"] sc_canvas.sc_service_profile = service["profile"] - + sc_canvas.save! - + end rescue OpenURI::HTTPError print "WARNING:\tHTTP error accessing manifest #{sc_manifest.sc_id}\n" end - end + end end end