documentcloud · narutosanjiv · Jul 14, 2014 · Jul 25, 2014
diff --git a/README b/README
@@ -14,9 +14,15 @@
   Installation:
   gem install docsplit
 
+  Additional options:
+    pdf_opts: a string of extra command-line options that docsplit passes verbatim (unescaped) to the pdftotext binary
+    For example:
+      Passing the -raw option to pdftotext:
+        Docsplit.extract_text(path, {:pdf_opts => '-raw'})
+
   For documentation, usage, and examples, see:
   http://documentcloud.github.com/docsplit/
 
   To suggest a feature or report a bug: 
   http://github.com/documentcloud/docsplit/issues/
-
+  
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -103,14 +103,16 @@ def run(command)
     # Extract the full contents of a pdf as a single file, directly.
     def extract_full(pdf)
       text_path = File.join(@output, "#{@pdf_name}.txt")
-      run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      # `@pdf_txt_opts` defaults to '' (see `extract_options`), so no branching is needed.
+      run "pdftotext -enc UTF-8 #{@pdf_txt_opts} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
     end
 
     # Extract the contents of a single page of text, directly, adding it to
     # the `@pages_to_ocr` list if the text length is inadequate.
     def extract_page(pdf, page)
       text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
-      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      # `@pdf_txt_opts` defaults to '' (see `extract_options`), so no branching is needed.
+      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{@pdf_txt_opts} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
       unless @forbid_ocr
         @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
       end
@@ -123,8 +125,11 @@ def extract_options(options)
       @forbid_ocr = options[:ocr] == false
       @clean_ocr  = !(options[:clean] == false)
       @language   = options[:language] || 'eng'
+      # Raw extra flags for pdftotext (e.g. '-raw'). They are spliced into the
+      # shell command WITHOUT escaping, so never pass untrusted input here.
+      @pdf_txt_opts = options[:pdf_opts] || ''
     end
 
   end
 
-end
+end
diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb
@@ -53,5 +53,11 @@ def test_name_escaping_while_extracting_text
     Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT)
     assert Dir["#{OUTPUT}/*.txt"].length == 2
   end
-
+
+  # Extra pdftotext flags passed via :pdf_opts must not break extraction of a
+  # file whose name needs shell escaping. Only the output file count is checked.
+  def test_name_escaping_while_extracting_text_with_pdf_opts
+    Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT, :pdf_opts => '-raw')
+    assert Dir["#{OUTPUT}/*.txt"].length == 2
+  end
 end