Jul 17, 2018

just simple ocr

$ sudo apt-get update
$ sudo apt-get install autoconf automake libtool
$ sudo apt-get install libpng12-dev
$ sudo apt-get install libjpeg62-dev
$ sudo apt-get install g++
$ sudo apt-get install libtiff4-dev
$ sudo apt-get install libopencv-dev libtesseract-dev
$ sudo apt-get install git
$ sudo apt-get install cmake
$ sudo apt-get install build-essential
$ sudo apt-get install libleptonica-dev
$ sudo apt-get install liblog4cplus-dev
$ sudo apt-get install libcurl3-dev
$ sudo apt-get install python2.7-dev
$ sudo apt-get install tk8.5 tcl8.5 tk8.5-dev tcl8.5-dev
$ sudo apt-get build-dep python-imaging --fix-missing
$ sudo apt-get install imagemagick

sudo apt-get build-dep python-imaging --fix-missing



$ wget http://www.leptonica.com/source/leptonica-1.76.0.tar.gz
tar -zxvf leptonica-1.76.0.tar.gz
$ cd leptonica-1.76.0/
$ ./autobuild
$ ./configure
$ make
$ sudo make install
$ sudo ldconfig


wget https://github.com/tesseract-ocr/tesseract/archive/3.05.02.tar.gz
$ tar -zxvf 3.05.02.tar.gz
$ cd tesseract-3.05.02/
$ ./autogen.sh
$ ./configure
$ make
$ sudo make install
$ sudo ldconfig

export TESSDATA_PREFIX=/usr/local/share/
sudo cp -r tessdata $TESSDATA_PREFIX

wget http://tessdata.projectnaptha.com/3.02/eng.traineddata.gz
gunzip eng.traineddata.gz
sudo cp -r eng.traineddata $TESSDATA_PREFIX/tessdata/


wget https://github.com/rhgraysonii/ocr_tutorial/archive/v0.tar.gz
$ tar -xf v0.tar.gz
cd python_ocr_tutorial-0/
sudo apt-get install python-virtualenv
$ virtualenv env
$ source env/bin/activate
$ pip install -r requirements.txt


ocr.py
===================
import pytesseract
import requests
from PIL import Image
from PIL import ImageFilter
from StringIO import StringIO


def process_image(url):
    """Download the image at ``url``, sharpen it, and return its OCR text.

    Args:
        url: Address of the image to fetch; handed to ``_get_image``.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the
        sharpened image.
    """
    image = _get_image(url)
    # Image.filter() returns a new image and leaves the original untouched,
    # so the result has to be rebound; otherwise the sharpening is silently
    # thrown away and OCR runs on the unfiltered picture.
    image = image.filter(ImageFilter.SHARPEN)
    return pytesseract.image_to_string(image)


def _get_image(url, timeout=30):
    """Fetch ``url`` over HTTP and open the response body as a PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait on the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The object returned by ``Image.open`` for the downloaded bytes.

    Raises:
        requests.exceptions.RequestException: on connection problems, a
            timeout, or a 4xx/5xx response.
    """
    # The payload is binary, so buffer it as bytes; io.BytesIO exists on
    # Python 2.6+ as well as Python 3.
    from io import BytesIO

    # NOTE(review): verify=False switches off TLS certificate checking, so
    # the download is open to tampering in transit. Kept to preserve the
    # existing behaviour for hosts with broken certificates -- confirm
    # whether it is still needed.
    response = requests.get(url, verify=False, timeout=timeout)
    # Surface HTTP errors here instead of handing an error page to PIL.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))




curl -X POST http://localhost:5000/v1/ocr -d '{"image_url": "https://ecs7.tokopedia.net/img/product-1/2015/5/6/329033/329033_ea3477c4-f38e-11e4-ba78-9dda64efb121.jpg" }' -H "Content-Type: application/json"


cli.py
====================



import sys
import requests
import pytesseract
from PIL import Image
from StringIO import StringIO


def get_image(url, timeout=30):
    """Fetch ``url`` over HTTP and open the response body as a PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait on the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The object returned by ``Image.open`` for the downloaded bytes.

    Raises:
        requests.exceptions.RequestException: on connection problems, a
            timeout, or a 4xx/5xx response.
    """
    # The payload is binary, so buffer it as bytes; io.BytesIO exists on
    # Python 2.6+ as well as Python 3.
    from io import BytesIO

    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here instead of handing an error page to PIL.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))


if __name__ == '__main__':
    # Interactive helper: asks for an image URL on stdin and prints the
    # unprocessed pytesseract output for that image.
    emit = sys.stdout.write
    banner = """
===OOOO=====CCCCC===RRRRRR=====\n
==OO==OO===CC=======RR===RR====\n
==OO==OO===CC=======RR===RR====\n
==OO==OO===CC=======RRRRRR=====\n
==OO==OO===CC=======RR==RR=====\n
==OO==OO===CC=======RR== RR====\n
===OOOO=====CCCCC===RR====RR===\n\n
"""
    emit(banner)
    emit("A simple OCR utility\n")
    target_url = raw_input(
        "What is the url of the image you would like to analyze?\n")
    fetched = get_image(target_url)
    emit("The raw output from tesseract with no processing is:\n\n")
    emit("-----------------BEGIN-----------------\n")
    # OCR runs only after the header lines above have been written.
    emit(pytesseract.image_to_string(fetched) + "\n")
    emit("------------------END------------------\n")



source : https://realpython.com/setting-up-a-simple-ocr-server/



.
$ sudo apt-get update
$ sudo apt-get install autoconf automake libtool
$ sudo apt-get install libpng12-dev
$ sudo apt-get install libjpeg62-dev
$ sudo apt-get install g++
$ sudo apt-get install libtiff4-dev
$ sudo apt-get install libopencv-dev libtesseract-dev
$ sudo apt-get install git
$ sudo apt-get install cmake
$ sudo apt-get install build-essential
$ sudo apt-get install libleptonica-dev
$ sudo apt-get install liblog4cplus-dev
$ sudo apt-get install libcurl3-dev
$ sudo apt-get install python2.7-dev
$ sudo apt-get install tk8.5 tcl8.5 tk8.5-dev tcl8.5-dev
$ sudo apt-get build-dep python-imaging --fix-missing
$ sudo apt-get install imagemagick

sudo apt-get build-dep python-imaging --fix-missing



$ wget http://www.leptonica.com/source/leptonica-1.76.0.tar.gz
tar -zxvf leptonica-1.76.0.tar.gz
$ cd leptonica-1.76.0/
$ ./autobuild
$ ./configure
$ make
$ sudo make install
$ sudo ldconfig


wget https://github.com/tesseract-ocr/tesseract/archive/3.05.02.tar.gz
$ tar -zxvf 3.05.02.tar.gz
$ cd tesseract-3.05.02/
$ ./autogen.sh
$ ./configure
$ make
$ sudo make install
$ sudo ldconfig

export TESSDATA_PREFIX=/usr/local/share/
sudo cp -r tessdata $TESSDATA_PREFIX

wget http://tessdata.projectnaptha.com/3.02/eng.traineddata.gz
gunzip eng.traineddata.gz
sudo cp -r eng.traineddata $TESSDATA_PREFIX/tessdata/


wget https://github.com/rhgraysonii/ocr_tutorial/archive/v0.tar.gz
$ tar -xf v0.tar.gz
cd python_ocr_tutorial-0/
sudo apt-get install python-virtualenv
$ virtualenv env
$ source env/bin/activate
$ pip install -r requirements.txt


ocr.py
===================
import pytesseract
import requests
from PIL import Image
from PIL import ImageFilter
from StringIO import StringIO


def process_image(url):
    """Download the image at ``url``, sharpen it, and return its OCR text.

    Args:
        url: Address of the image to fetch; handed to ``_get_image``.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the
        sharpened image.
    """
    image = _get_image(url)
    # Image.filter() returns a new image and leaves the original untouched,
    # so the result has to be rebound; otherwise the sharpening is silently
    # thrown away and OCR runs on the unfiltered picture.
    image = image.filter(ImageFilter.SHARPEN)
    return pytesseract.image_to_string(image)


def _get_image(url, timeout=30):
    """Fetch ``url`` over HTTP and open the response body as a PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait on the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The object returned by ``Image.open`` for the downloaded bytes.

    Raises:
        requests.exceptions.RequestException: on connection problems, a
            timeout, or a 4xx/5xx response.
    """
    # The payload is binary, so buffer it as bytes; io.BytesIO exists on
    # Python 2.6+ as well as Python 3.
    from io import BytesIO

    # NOTE(review): verify=False switches off TLS certificate checking, so
    # the download is open to tampering in transit. Kept to preserve the
    # existing behaviour for hosts with broken certificates -- confirm
    # whether it is still needed.
    response = requests.get(url, verify=False, timeout=timeout)
    # Surface HTTP errors here instead of handing an error page to PIL.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))




curl -X POST http://localhost:5000/v1/ocr -d '{"image_url": "https://ecs7.tokopedia.net/img/product-1/2015/5/6/329033/329033_ea3477c4-f38e-11e4-ba78-9dda64efb121.jpg" }' -H "Content-Type: application/json"


cli.py
====================



import sys
import requests
import pytesseract
from PIL import Image
from StringIO import StringIO


def get_image(url, timeout=30):
    """Fetch ``url`` over HTTP and open the response body as a PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait on the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The object returned by ``Image.open`` for the downloaded bytes.

    Raises:
        requests.exceptions.RequestException: on connection problems, a
            timeout, or a 4xx/5xx response.
    """
    # The payload is binary, so buffer it as bytes; io.BytesIO exists on
    # Python 2.6+ as well as Python 3.
    from io import BytesIO

    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here instead of handing an error page to PIL.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))


if __name__ == '__main__':
    # Interactive helper: asks for an image URL on stdin and prints the
    # unprocessed pytesseract output for that image.
    emit = sys.stdout.write
    banner = """
===OOOO=====CCCCC===RRRRRR=====\n
==OO==OO===CC=======RR===RR====\n
==OO==OO===CC=======RR===RR====\n
==OO==OO===CC=======RRRRRR=====\n
==OO==OO===CC=======RR==RR=====\n
==OO==OO===CC=======RR== RR====\n
===OOOO=====CCCCC===RR====RR===\n\n
"""
    emit(banner)
    emit("A simple OCR utility\n")
    target_url = raw_input(
        "What is the url of the image you would like to analyze?\n")
    fetched = get_image(target_url)
    emit("The raw output from tesseract with no processing is:\n\n")
    emit("-----------------BEGIN-----------------\n")
    # OCR runs only after the header lines above have been written.
    emit(pytesseract.image_to_string(fetched) + "\n")
    emit("------------------END------------------\n")



source : https://realpython.com/setting-up-a-simple-ocr-server/


No comments:

Post a Comment