#!/bin/bash
#
# textract - OCR an image using AWS Textract.
#
# Usage: textract <jpeg-file>
#
# Uploads the given file to an S3 bucket with s3cmd, runs
# `aws textract detect-document-text` on the uploaded object, prints the
# detected lines of text to stdout (one per line), and removes the uploaded
# object again on exit.
#
# Environment:
#   TEXTRACT_BUCKET  S3 bucket used for the temporary upload
#                    (default: textract.chezphil.org).
#
# Requires: s3cmd and the aws CLI, both already configured with credentials.

set -euo pipefail

readonly BUCKET="${TEXTRACT_BUCKET:-textract.chezphil.org}"

# Print the help text to stderr (so it never pollutes captured OCR output)
# and exit with status 1.
usage() {
  {
    echo "OCR using AWS Textract."
    echo "Usage: textract <jpeg-file>"
    echo "Plain text output to stdout."
  } >&2
  exit 1
}

case $# in
  1) ;;
  *) usage ;;
esac

image="$1"

# Fail early with a clear message instead of relying on s3cmd's error.
if [[ ! -r "${image}" || -d "${image}" ]]; then
  printf 'textract: cannot read file: %s\n' "${image}" >&2
  exit 1
fi

# Build the object key from the file's base name only, restricted to a safe
# character set so it can be embedded in the JSON request without escaping,
# and prefixed with time/PID/random so concurrent runs do not collide.
base_name="${image##*/}"
safe_name="${base_name//[^A-Za-z0-9._-]/_}"
key="$(date +%s)-$$-${RANDOM}-${safe_name}"
url="s3://${BUCKET}/${key}"

# Best-effort removal of the uploaded object; a failure here (e.g. the upload
# never happened) must not mask the script's real exit status.
cleanup() {
  s3cmd -q rm "${url}" || true
}
trap cleanup EXIT

s3cmd -q put "${image}" "${url}"

printf -v document '{"S3Object":{"Bucket":"%s","Name":"%s"}}' \
  "${BUCKET}" "${key}"

# The response contains blocks of several types; only the LINE blocks are
# wanted (WORD blocks repeat the same text word by word).  The JMESPath query
# selects the Text of each LINE block, and wrapping it in a one-element list
# makes the CLI's text output format emit one row per line of text.
# Capturing the output first means nothing partial is printed if the call
# fails, and avoids the CLI's pager when stdout is a terminal.
text="$(aws textract detect-document-text \
  --document "${document}" \
  --query "Blocks[?BlockType=='LINE'].[Text]" \
  --output text)"

if [[ -n "${text}" ]]; then
  printf '%s\n' "${text}"
fi