Library/Formula/tesseract.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

require 'formula'

# Companion download holding the English language data archive
# (tesseract-2.00.eng.tar.gz). It declares only a source URL, checksum and
# version; it defines no install step of its own.
class TesseractEnglishData < Formula
  url 'http://tesseract-ocr.googlecode.com/files/tesseract-2.00.eng.tar.gz'

  # Declared explicitly rather than left to be derived from the URL.
  version '2.00'
  md5 'b8291d6b3a63ce7879d688e845e341a9'
end

# Formula for the Tesseract OCR engine, version 2.04.
#
# The English language data is fetched through TesseractEnglishData and
# copied into the source tree before the build is installed.
class Tesseract < Formula
  homepage 'http://code.google.com/p/tesseract-ocr/'
  url 'http://tesseract-ocr.googlecode.com/files/tesseract-2.04.tar.gz'
  md5 'b44eba1a9f4892ac62e484c807fe0533'

  depends_on 'libtiff'

  # Returns a short usage blurb describing how to feed an image to tesseract.
  # The heredoc is returned verbatim, leading indentation included.
  def about
    <<-EOS
    Tesseract is an OCR (Optical Character Recognition) engine.

    The easiest way to use it is to convert the source to a Grayscale tiff:
    `convert source.png -type Grayscale terre_input.tif`
    then run tesseract:
    `tesseract terre_input.tif output`
    EOS
  end

  # Stages the English language data into ./tessdata, then configures the
  # source tree and runs `make install`.
  def install
    # Force gcc 4.2: the 'tesseract' executable segfaults on 10.6 when it is
    # compiled with llvm-gcc (LLVM build 2206).
    ENV.gcc_4_2

    # 'make install' expects the language data files in the build directory.
    # Remember where the build directory is before entering the brew block;
    # presumably `brew` runs the block from inside the unpacked data archive,
    # so the glob below is relative to that location, not to this one.
    build_dir = Dir.pwd
    TesseractEnglishData.new.brew do
      data_files = Dir.glob("*")
      FileUtils.cp data_files, "#{build_dir}/tessdata/"
    end

    configure_flags = [
      "--prefix=#{prefix}",
      "--disable-debug",
      "--disable-dependency-tracking"
    ]
    system "./configure", *configure_flags
    system "make install"
  end
end