Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions index.html
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,11 @@ <h2 id="installation">Installation &amp; Dependencies</h2>
<tt>aptitude install libreoffice</tt><br />
On the Mac, download and install <a href="http://www.libreoffice.org/download">the latest release</a>.
</li>
<li>
(Optional) Install <a href="http://www.gnu.org/software/parallel/">GNU Parallel</a>:<br />
<tt>[aptitude | port | brew] install parallel</tt><br />
GNU Parallel speeds up OCR text extraction by processing the pages of a document concurrently.
</li>
</ol>

<p><i>
Expand Down
8 changes: 4 additions & 4 deletions lib/docsplit.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ module Docsplit
ESCAPED_ROOT = ESCAPE[ROOT]

METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]

GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]

DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]

DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false, :parallel => false}

# Check for all dependencies, and note their absence.
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
DEPENDENCIES.each_key do |dep|
Expand Down Expand Up @@ -75,7 +75,7 @@ def self.extract_#{key}(pdfs, opts={})
end
EOS
end

def self.extract_info(pdfs, opts={})
pdfs = ensure_pdfs(pdfs)
InfoExtractor.new.extract_all(pdfs, opts)
Expand Down
18 changes: 11 additions & 7 deletions lib/docsplit/text_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -72,18 +72,22 @@ def extract_from_ocr(pdf, pages)
FileUtils.remove_entry_secure tiff
end
else
tiff = "#{tempdir}/#{@pdf_name}.tif"
escaped_tiff = ESCAPE[tiff]
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
# If the user disables orientation detection, or the plugin is not installed, set psm to 0.
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
clean_text(base_path + '.txt') if @clean_ocr
if DEPENDENCIES[:parallel]
run "MAGICK_TMPDIR=#{tempdir} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{tempdir}/page_%d.tiff 2>&1"
run "parallel tesseract -l #{@language} #{psm} {} {.} ::: #{tempdir}/page_*.tiff 2>&1"
run "cat #{tempdir}/page_*.txt >'#{base_path}.txt' 2>&1"
else
tiff = "#{tempdir}/#{@pdf_name}.tif"
escaped_tiff = ESCAPE[tiff]
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
end
clean_text("#{base_path}.txt") if @clean_ocr
end
ensure
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
end


private

def clean_text(file)
Expand Down