#!/bin/bash

# Strict mode: stop on any failing command, on use of an unset variable,
# and when any stage of a pipeline fails (not only the last one).
set -euo pipefail

# Exactly one argument (the image to OCR) is required.  Anything else gets
# the usage text on stderr, so stdout carries nothing but the OCR result.
if [[ $# -ne 1 ]]; then
  {
    echo "OCR using AWS Textract."
    echo "Usage: textract <file.jpeg>"
    echo "Plain text output to stdout."
  } >&2
  exit 1
fi

# Local image path; it is also used verbatim as the object key in the bucket.
JPEG="$1"
BUCKET="textract.chezphil.org"
URL="s3://${BUCKET}/${JPEG}"

# Scratch file that receives the raw JSON response from Textract.
JSON=$(mktemp)

# On any exit, remove the uploaded object and the scratch file.  The
# "|| true" and "rm -f" ensure a failed remote delete cannot prevent the
# local temp file from being removed while errexit is active.
trap 's3cmd -q rm "${URL}" || true; rm -f -- "${JSON}"' EXIT

s3cmd -q put "${JPEG}" "${URL}"

# Escape backslashes and double quotes so the object key remains a valid
# JSON string even for unusual file names.
NAME_JSON=${JPEG//\\/\\\\}
NAME_JSON=${NAME_JSON//\"/\\\"}

aws textract detect-document-text \
    --document "{\"S3Object\":{\"Bucket\":\"${BUCKET}\",\"Name\":\"${NAME_JSON}\"}}" \
    > "${JSON}"

# The response structure is quite complex, but the important parts look like:
#    "BlockType": "LINE",
#    "Text": "Film 1",
# There are other block types, specifically WORD, which repeat the text one
# word at a time; those are not wanted.
#
# The parsing below is line-based: it relies on each key sitting on its own
# line, with "BlockType" appearing before "Text" inside a block.

#######################################
# Print the text of every LINE block, one per output line.
# Arguments: none
# Inputs:    Textract JSON response on stdin
# Outputs:   plain text on stdout
#######################################
extract_lines() {
  # awk: remember whether the current block is a LINE, and for its "Text"
  #      entry emit everything after the first colon.
  # sed: 1) strip leading blanks, the surrounding quotes and the trailing
  #         comma (optional, in case "Text" is the last key of its object);
  #      2) undo JSON escaping of double quotes and backslashes in a single
  #         left-to-right pass.
  awk '/"BlockType": "LINE"/ { line=1 }
       /"BlockType": "WORD"/ { line=0 }
       /"Text"/ && line==1   { print substr($0, index($0,":")+1) }' |
  sed -e 's/^ *"\(.*\)",\{0,1\}$/\1/' \
      -e 's/\\\([\\"]\)/\1/g'
}

extract_lines < "${JSON}"