paperweb/cli/paperfind.sh

#!/bin/bash
# PAPERWEB - GPLv3 licence
# Copyright 2016 Yves Gablin
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#   -h This help.
#
# Query:
#   -Q [-d <pipe-separated list of YYYYMMDD/YYYYMM/YYYY prefixes to search>]
#      [-l <pipe-separated list of labels to match (all must be found)>]
#      [-k <pipe-separated list of keywords to match (all must be found)>]
#      [-i] : case-insensitive search
#   At least one of -d, -l, or -k must be used.
#   Result (WSN):
#     OUTPUT   = "[" [ DOC_LIST ] "]" .
#     DOC_LIST = DOC { "," DOC } .
#     DOC      = "{""folder"":" FOLDER ",""labels"":[" [ LABELS ] "],""count"":" COUNT ",""type"":" TYPE "}" .
#     FOLDER   = json_string .
#     LABELS   = json_string { "," json_string } .
#     COUNT    = json_number .
#     TYPE     = """pdf""" | """pages""" .
#
# Retrieve a document's thumbnails:
#   -T <date> : the folder-name of the document
#   Result (WSN):
#     OUTPUT   = "[" THUMBS "]" .
#     THUMBS   = CONTENTS { "," CONTENTS } .
#     CONTENTS = "{""mime"":" MIME ",""data"":" B64_DATA ",""width"":" WIDTH ",""height"":" HEIGHT "}" .
#     MIME     = json_string .
#     B64_DATA = json_string .
#     WIDTH    = json_number .
#     HEIGHT   = json_number .
#
# Retrieve a document's page/PDF and metadata:
#   -D <date> -p <page number>
#   Result (WSN):
#     OUTPUT   = CONTENTS .
#   The "data" field contains the wanted page in JPEG format, except if the
#   document is a PDF file: then the whole PDF file is encoded.
#   Width and height should be ignored for PDF contents.
#
# Retrieve a document's page's metadata without the actual page:
#   -M <date> -p <page number>
#   Result (WSN):
#     OUTPUT   = "{""mime"":" MIME ",""width"":" WIDTH ",""height"":" HEIGHT "}" .
#   Width and height should be ignored for PDF contents.
#
# Retrieve a raw document's page/PDF without metadata:
#   -R <date> -p <page number>
#   Result: page contents.
#   The returned data is the raw page in JPEG format, except if the document is
#   a PDF file: then the whole PDF file is returned.
#
# The behaviour described above changes if the commands "pdfinfo" and "pdftoppm"
# are both available. In this case, pages from PDF documents are treated the
# same way as pages from image-based documents.

##########  CONFIGURATION ##########

# Directory holding one sub-directory per document. The script changes into it
# before running an action and resolves every document name against it.
BASE='/PATH/TO/PAPERWORK/BASE/DIRECTORY'
# Resolution passed to "pdftoppm -r" when a PDF page is rendered as a JPEG.
PDF_DPI=90

##### NO CHANGE PAST THIS LINE #####

# INIT
# Every state variable is reset explicitly so that nothing can leak in from
# the caller's environment.
mode=       # selected action: Q, T, D, M or R
dates=()    # -d: folder-name prefixes to search
labels=()   # -l: labels that must all match
words=()    # -k: keywords that must all match
q_ci=       # -i: non-empty when the search is case-insensitive
doc=        # -T/-D/-M/-R: folder name of the document
page=       # -p: page number
pdfasjpg=   # non-empty when PDF pages can be rendered as JPEG
# Rendering single PDF pages needs both poppler tools.
if command -v pdfinfo >/dev/null 2>&1 && command -v pdftoppm >/dev/null 2>&1; then
  pdfasjpg=true
fi

# READ COMMAND LINE PARAMETERS

# Parse the command-line options into the global state variables
# (mode, dates, labels, words, q_ci, doc, page).
#   $@ : the script's command-line arguments
# Every option value is filtered through tr before it is stored:
#   -d keeps only digits and '|'; -l/-k drop commas, double quotes and control
#   characters; -T/-D/-M/-R keep only digits and '_'; -p keeps only digits.
# -h prints the header comment of this file and exits with status 0.
parse_args() {
  local opt OPTARG OPTIND=1
  while getopts hQd:l:k:iT:D:p:M:R: opt; do
    case "$opt" in
    h) sed -n '2,/^$/s/.//p' "$0"; exit 0 ;;
    Q) mode=Q ;;
    # read -r: keep backslashes typed by the user instead of swallowing them.
    d) IFS='|' read -r -a dates < <(tr -dc '|[:digit:]' <<<"$OPTARG") ;;
    l) IFS='|' read -r -a labels < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
    k) IFS='|' read -r -a words < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
    i) q_ci=true ;;
    T|D|M|R)
      mode=$opt
      doc=$(tr -dc '[:digit:]_' <<<"$OPTARG")
      ;;
    p) page=$(tr -dc '[:digit:]' <<<"$OPTARG") ;;
    esac
  done
}
parse_args "$@"

# CHECK COMMAND LINE PARAMETERS
# Exit status: 1 = no action selected, 2 = query without any usable criterion,
# 3 = unknown document or page.
case "$mode" in
Q)
  # Drop date prefixes shorter than a year (4 digits). The array is rebuilt
  # rather than "unset" element by element, so it stays densely indexed and
  # ${dates[0]} is always the first surviving prefix.
  kept_dates=()
  for d in "${dates[@]}"; do
    (( ${#d} >= 4 )) && kept_dates+=("$d")
  done
  dates=("${kept_dates[@]}")
  [[ -n "${dates[*]}${labels[*]}${words[*]}" ]] || exit 2
  ;;
T)
  [[ -n "$doc" && -d "$BASE/$doc" ]] || exit 3
  ;;
D|M|R)
  [[ -n "$doc" && -d "$BASE/$doc" ]] || exit 3
  if [[ -f "$BASE/$doc/doc.pdf" ]]; then
    # The page number is only checked when single PDF pages can be rendered;
    # otherwise the PDF is used as a whole.
    if [[ -n "$pdfasjpg" ]]; then
      maxp=$(pdfinfo "$BASE/$doc/doc.pdf" | awk '/^Pages:/{print $2}')
      [[ "$maxp" =~ ^[0-9]+$ && "$page" =~ ^[0-9]+$ ]] || exit 3
      # 10# forces base 10 so that a page such as "08" is not read as octal.
      (( 10#$page > 0 && 10#$page <= 10#$maxp )) || exit 3
    fi
  else
    [[ -f "$BASE/$doc/paper.$page.jpg" ]] || exit 3
  fi
  ;;
*)
  exit 1
  ;;
esac

# RUN

# Encode stdin as a single JSON string literal.
#   stdin : raw text
#   stdout: the text between double quotes, without a trailing newline;
#           \ / and " are prefixed with a backslash, and TAB, CR and embedded
#           newlines are written as \t, \r and \n.
# The terminator of the last input line is not part of the result. Control
# characters other than TAB, CR and LF are passed through unchanged.
json_string() {
  # The ":join" loop gathers the whole input into the pattern space so that
  # newlines between lines can be escaped instead of being emitted raw.
  printf '"%s"' "$(
    sed -e ':join' -e '$!{' -e 'N' -e 'b join' -e '}' \
        -e 's#[\\/"]#\\&#g' -e 's#\t#\\t#g' -e 's#\r#\\r#g' -e 's#\n#\\n#g'
  )"
}

# Report the pixel size of the image read from stdin.
#   stdin : image data
#   stdout: "<width> <height>", taken from the "<W>x<H>" field that file(1)
#           prints as the last-but-one comma-separated item of its
#           description; "0 0" when the description has no such field.
image_wh() {
  local dims
  # -n/p: print only on a match, so an unexpected description can never be
  # handed to the callers as if it were a pair of numbers.
  dims=$(file -b - | sed -nE 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/p')
  printf '%s\n' "${dims:-0 0}"
}

# Run the selected action from inside the base directory, so that document
# folders can be addressed by their bare name.
cd "$BASE" || {
  printf 'paperfind: cannot enter base directory: %s\n' "$BASE" >&2
  exit 4
}
case "$mode" in
Q)
  # Query: print a JSON array describing every document that satisfies all
  # the given criteria. No user-supplied text is ever passed through eval.

  # grep options shared by the label and keyword filters.
  match_opts=(-F)
  [[ -n "$q_ci" ]] && match_opts+=(-i)

  # 1. Candidates: the direct sub-directories of the base directory,
  #    restricted to the requested name prefixes when -d was given.
  find_args=(. -mindepth 1 -maxdepth 1 -type d)
  if (( ${#dates[@]} > 0 )); then
    find_args+=('(' -false)
    for prefix in "${dates[@]}"; do
      find_args+=(-o -name "${prefix}*")
    done
    find_args+=(')')
  fi
  mapfile -t docs < <(find "${find_args[@]}" -printf '%P\n' 2>/dev/null)

  # 2. Labels (AND): keep the documents whose "labels" file contains every
  #    requested label. Documents without a readable file are dropped.
  for label in "${labels[@]}"; do
    kept=()
    for d in "${docs[@]}"; do
      grep -q "${match_opts[@]}" -e "$label" -- "$d/labels" 2>/dev/null \
        && kept+=("$d")
    done
    docs=("${kept[@]}")
  done

  # 3. Keywords (AND): keep the documents in which every keyword occurs in
  #    the text of at least one innermost <span> of the paper.*.words files.
  #    Only the span text is searched, never the file name.
  if (( ${#words[@]} > 0 )); then
    kept=()
    for d in "${docs[@]}"; do
      text=$(grep -ho '>[^<>]*</span' "$d"/paper.*.words 2>/dev/null \
        | sed 's#</span$##;s#^>##;s#&lt;#<#g;s#&gt;#>#g;s#&amp;#\&#g')
      [[ -n "$text" ]] || continue
      hit=true
      for word in "${words[@]}"; do
        grep -q "${match_opts[@]}" -e "$word" <<<"$text" || { hit=; break; }
      done
      [[ -n "$hit" ]] && kept+=("$d")
    done
    docs=("${kept[@]}")
  fi

  # 4. Output, in reverse folder-name order.
  entries=()
  while IFS= read -r folder; do
    [[ -n "$folder" ]] || continue
    if [[ -f "$folder/doc.pdf" ]]; then kind=pdf; else kind=pages; fi
    # An unmatched glob stays literal: treat that as "no thumbnail".
    thumbs=("$folder"/paper.*.thumb.jpg)
    [[ -e "${thumbs[0]}" ]] || thumbs=()
    # A label is the first comma-separated field of each line of "labels".
    labs=()
    if [[ -r "$folder/labels" ]]; then
      while IFS=, read -r lab rest; do
        labs+=("$(json_string <<<"$lab")")
      done < <(sort -df -- "$folder/labels")
    fi
    entries+=("$(printf '{"folder":%s,"labels":[%s],"count":%d,"type":"%s"}' \
      "$(json_string <<<"$folder")" "$(IFS=,; printf '%s' "${labs[*]}")" \
      "${#thumbs[@]}" "$kind")")
  done < <(printf '%s\n' "${docs[@]}" | sort -r)
  (IFS=,; printf '[%s]' "${entries[*]}")
  ;;
T)
  # Thumbnails: a JSON array with one base64-encoded JPEG per page, ordered
  # by the page number embedded in the file name.
  entries=()
  while IFS= read -r thumb; do
    [[ -f "$thumb" ]] || continue
    read -r w h < <(image_wh <"$thumb")
    entries+=("$(printf '{"mime":"image\/jpeg","data":%s,"width":%d,"height":%d}' \
      "$(base64 --wrap=0 "$thumb" | json_string)" "${w:-0}" "${h:-0}")")
  done < <(printf '%s\n' "$doc"/paper.*.thumb.jpg | sort -t. -k2,2n)
  (IFS=,; printf '[%s]' "${entries[*]}")
  ;;
D|M)
  # Page with metadata (D) or metadata only (M). "cmd" is the command that
  # produces the raw bytes of the page; it is only run for D.
  if [[ -f "$doc/doc.pdf" ]]; then
    p=$doc/doc.pdf
    if [[ -n "$pdfasjpg" ]]; then
      cmd=(pdftoppm -r "$PDF_DPI" -jpeg -f "$page" -l "$page" "$p")
      read -r w h < <("${cmd[@]}" | image_wh)
      mime='image/jpeg'
    else
      # Whole PDF: width and height carry no meaning.
      w=0; h=0; mime='application/pdf'
      cmd=(cat -- "$p")
    fi
  else
    p=$doc/paper.$page.jpg
    read -r w h < <(image_wh <"$p")
    mime='image/jpeg'
    cmd=(cat -- "$p")
  fi
  if [[ "$mode" == D ]]; then
    printf '{"mime":%s,"data":%s,"width":%d,"height":%d}' \
      "$(json_string <<<"$mime")" \
      "$("${cmd[@]}" | base64 --wrap=0 | json_string)" "${w:-0}" "${h:-0}"
  else
    printf '{"mime":%s,"width":%d,"height":%d}' \
      "$(json_string <<<"$mime")" "${w:-0}" "${h:-0}"
  fi
  ;;
R)
  # Raw page bytes: a rendered JPEG page, the whole PDF, or the stored JPEG.
  if [[ -f "$doc/doc.pdf" ]]; then
    if [[ -n "$pdfasjpg" ]]; then
      pdftoppm -r "$PDF_DPI" -jpeg -f "$page" -l "$page" "$doc/doc.pdf"
    else
      cat -- "$doc/doc.pdf"
    fi
  else
    cat -- "$doc/paper.$page.jpg"
  fi
  ;;
esac
Initial version. 2016-01-21 22:12:19 +01:00			`#!/bin/bash`
			`# PAPERWEB - GPLv3 licence`
			`# Copyright 2016 Yves Gablin`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`#`
			`# -h This help.`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`#`
Initial version. 2016-01-21 22:12:19 +01:00			`# Query:`
			`# -Q [-d <pipe-separated list of YYYYMMDD/YYYYMM/YYYY prefixes to search>]`
			`# [-l <pipe-separated list of labels to match (all must be found)>]`
			`# [-k <pipe-separated list of keywords to match (all must be found)>]`
			`# [-i] : case-insensitive search`
			`# At least -d, -l, or -k must be used.`
			`# Result (WSN):`
			`# OUTPUT = "[" [ DOC_LIST ] "]" .`
			`# DOC_LIST = DOC { "," DOC } .`
			`# DOC = "{""folder"":" FOLDER ",""labels"":[" [ LABELS ] "],""count"":" COUNT ",""type"":" TYPE "}" .`
			`# FOLDER = json_string .`
			`# LABELS = json_string { "," json_string } .`
			`# COUNT = json_number .`
			`# TYPE = """pdf""" \| """pages""" .`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`#`
Initial version. 2016-01-21 22:12:19 +01:00			`# Retrieve a document's thumbnails:`
			`# -T <date> : the folder-name of the document`
			`# Result (WSN):`
			`# OUTPUT = "[" THUMBS "]" .`
			`# THUMBS = CONTENTS { "," CONTENTS } .`
			`# CONTENTS = "{""mime"":" MIME ",""data"":" B64_DATA ",""width"":" WIDTH ",""height"":" HEIGHT "}" .`
			`# MIME = json_string .`
			`# B64_DATA = json_string .`
			`# WIDTH = json_number .`
			`# HEIGHT = json_number .`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`#`
Initial version. 2016-01-21 22:12:19 +01:00			`# Retrieve a document's page/PDF and metadata:`
			`# -D <date> -p <page number>`
			`# Result (WSN):`
			`# OUTPUT = CONTENTS .`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`# The "data" field contains the wanted page in JPEG format, except if the`
			`# document is a PDF file: then the whole PDF file is encoded.`
Initial version. 2016-01-21 22:12:19 +01:00			`# Width and height should be ignored for PDF contents.`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`#`
			`# Retrieve a document's page's metadata without the actual page:`
			`# -M <date> -p <page number>`
			`# Result (WSN):`
			`# OUTPUT = "{""mime"":" MIME ",""width"":" WIDTH ",""height"":" HEIGHT "}" .`
			`# Width and height should be ignored for PDF contents.`
			`#`
Initial version. 2016-01-21 22:12:19 +01:00			`# Retrieve a raw document's page/PDF without metadata:`
			`# -R <date> -p <page number>`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`# Result: page contents.`
			`# The returned data is the raw page in JPEG format, except if the document is`
			`# a PDF file: then the whole PDF file is returned.`
			`#`
			`# The behaviour exposed above changes if the commands "pdfinfo" and "pdftoppm"`
			`# are both available. In this case, pages from PDF documents are treated the`
			`# same way as pages from image-based documents.`
Initial version. 2016-01-21 22:12:19 +01:00
			`########## CONFIGURATION ##########`

			`BASE='/PATH/TO/PAPERWORK/BASE/DIRECTORY'`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`PDF_DPI=90`
Initial version. 2016-01-21 22:12:19 +01:00
			`##### NO CHANGE PAST THIS LINE #####`

			`# INIT`
			`mode=`
			`dates=()`
			`labels=()`
			`words=()`
			`q_ci=`
			`doc=`
			`page=`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`{ type pdfinfo && type pdftoppm; } &>/dev/null && pdfasjpg=true`
Initial version. 2016-01-21 22:12:19 +01:00
			`# READ COMMAND LINE PARAMETERS`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`while getopts hQd:l:k:iT:D:p:M:R: opt; do case "$opt" in`
Initial version. 2016-01-21 22:12:19 +01:00			`h) sed -n '2,/^$/s/.//p' "$0"; exit 0 ;;`
			`Q) mode=Q ;;`
			`d) IFS='\|' read -a dates < <(tr -dc '\|[:digit:]' <<<"$OPTARG") ;;`
			`l) IFS='\|' read -a labels < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;`
			`k) IFS='\|' read -a words < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;`
			`i) q_ci=true ;;`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`T\|D\|M\|R)`
Initial version. 2016-01-21 22:12:19 +01:00			`mode=$opt; doc=$(tr -dc '[:digit:]_' <<<"$OPTARG") ;;`
			`p) page=$(tr -dc '[:digit:]' <<<"$OPTARG") ;;`
			`esac; done`

			`# CHECK COMMAND LINE PARAMETERS`
			`case "$mode" in`
			`Q) for ((i=${#dates[*]}-1;i>=0;i--)); do [ ${#dates[i]} -ge 4 ] \|\| unset dates[i]; done`
			`[ -n "${dates[]}${labels[]}${words[*]}" ] \|\| exit 2 ;;`
			`T) [ -n "$doc" -a -d "$BASE/$doc" ] \|\| exit 3 ;;`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`D\|M\|R)`
Initial version. 2016-01-21 22:12:19 +01:00			`[ -n "$doc" -a -d "$BASE/$doc" ] \|\| exit 3`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`if [ -f "$BASE/$doc/doc.pdf" ]; then`
			`if [ -n "$pdfasjpg" ]; then`
			`maxp=$(pdfinfo "$BASE/$doc/doc.pdf" \| awk '/^Pages:/{print $2}')`
			`[ -n "$maxp" -a -n "$page" -a $page -gt 0 -a $page -le $maxp ] \|\| exit 3`
			`fi`
			`else`
			`[ -f "$BASE/$doc/paper.$page.jpg" ] \|\| exit 3`
			`fi ;;`
Initial version. 2016-01-21 22:12:19 +01:00			`*) exit 1 ;;`
			`esac`

			`# RUN`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00
			`# &0: raw string`
			`# &1: json string`
Initial version. 2016-01-21 22:12:19 +01:00			`function json_string() {`
			`printf '"%s"' "$(sed 's#[\\/"]#\\&#g;s#\t#\\t#g')"`
			`}`

Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`# &0: image data`
			`# &1: "<width> <height>"`
			`function image_wh() {`
			`file -b - \| sed -r 's/., ([0-9]+)x([0-9]+),[^,]$/\1 \2/'`
			`}`

Initial version. 2016-01-21 22:12:19 +01:00			`cd "$BASE"`
			`case "$mode" in`
			`Q)`
			`found="$(`
			`case ${#dates[*]} in`
			`0) find . -mindepth 1 -maxdepth 1 -type d -printf '%P/\|\n' ;;`
			`1) find ${dates[0]}* -maxdepth 0 -type d -printf '%p/\|\n' ;;`
			`) eval find {$(IFS=, eval echo '"${dates[]}"')}\* -maxdepth 0 -type d -printf "'%p/\|\\n'" ;;`
			`esac 2>/dev/null)"`
			`if [ ${#labels[*]} -gt 0 ]; then`
			`# This is an AND`
			`for l in "${labels[@]}"; do`
Fix hang if non-existing label is searched. 2016-01-21 23:17:09 +01:00			`[ -n "$found" ] && found="$(`
Initial version. 2016-01-21 22:12:19 +01:00			`eval grep -lF${q_ci:+i}e "$l" ${found//\|/labels} 2>/dev/null \`
			`\| sed 's#labels$#\|#')"`
			`done`
			`## That would be an OR (for later, perhaps)`
			`#found="$(`
			`# eval grep -lF${q_ci:+i}f <(printf '%s\n' "${labels[@]}") ${found//\|/labels} 2>/dev/null \`
			`# \| sed 's#labels$#\|#')"`
			`fi`
Fix hang if non-existing label is searched. 2016-01-21 23:17:09 +01:00			`if [ ${#words[*]} -gt 0 ] && [ -n "$found" ]; then`
Initial version. 2016-01-21 22:12:19 +01:00			`# This is an AND`
			`tmpf="$(`
			`eval grep -Ho "'>[^<>]</span'" ${found//\|/paper..words} 2>/dev/null \`
			`\| sed 's#</span$##;s#<#<#g;s#>#>#g;s#&#\&#g')"`
			`for w in "${words[@]}"; do`
			`tmpf="$(grep -hF${q_ci:+i}e "$w" <<<"$tmpf")"`
			`done`
			`found="$(cut -d: -f1 <<<"$tmpf" \| sed 's#paper\.[0-9]*\.words$#\|#' \| sort -u)"`
			`## That would be an OR (for later, perhaps)`
			`#found="$(`
			`# eval grep -Ho "'>[^<>]</span'" ${found//\|/paper..words} 2>/dev/null \`
			`# \| sed 's#</span$##;s#<#<#g;s#>#>#g;s#&#\&#g' \`
			`# \| grep -hF${q_ci:+i}f <(printf '%s\n' "${words[@]}") \`
			`# \| cut -d: -f1 \`
			`# \| sed 's#paper\.[0-9]*\.words$#\|#' \`
			`# \| sort -u)"`
			`fi`
			`printf '['`
			`while IFS=/ read folder nil; do if [ -n "$folder" ]; then`
			`[ -f $folder/doc.pdf ] && type=pdf \|\| type=pages`
			`count=$(/bin/ls -1 $folder/paper.*.thumb.jpg 2>/dev/null \| wc -l)`
			`labs="$(`
			`while IFS=, read lab nil; do printf ','; json_string <<<"$lab"; done < <(sort -df $folder/labels))"`
			`printf ',{"folder":%s,"labels":[%s],"count":%d,"type":"%s"}' \`
			`"$(json_string <<<"$folder")" "${labs:1}" $count "$type"`
			`fi; done < <(sort -r <<<"$found") \| sed 's/^.//'`
			`printf ']'`
			`;;`
			`T)`
			`printf '['`
			`/bin/ls -1 $doc/paper.*.thumb.jpg \| sort -t. -k2,2n \`
			`\| while read t; do`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`read w h < <(image_wh <$t)`
Initial version. 2016-01-21 22:12:19 +01:00			`printf ',{"mime":"image\/jpeg","data":%s,"width":%d,"height":%d}' \`
			`"$(base64 --wrap=0 "$t" \| json_string)" $w $h`
			`done \| sed 's/^.//'`
			`printf ']'`
			`;;`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`D\|M)`
Initial version. 2016-01-21 22:12:19 +01:00			`if [ -f $doc/doc.pdf ]; then`
			`p=$doc/doc.pdf`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`if [ -n "$pdfasjpg" ]; then`
			`read w h < <(pdftoppm -r $PDF_DPI -jpeg -f $page -l $page $p \| image_wh)`
			`mime='image/jpeg'`
			`cmd=(pdftoppm -r $PDF_DPI -jpeg -f $page -l $page $p)`
			`else`
			`w=0; h=0; mime='application/pdf'`
			`cmd=(cat $p)`
			`fi`
Initial version. 2016-01-21 22:12:19 +01:00			`else`
			`p=$doc/paper.$page.jpg`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`read w h < <(image_wh <$p)`
			`mime='image/jpeg'`
			`cmd=(cat $p)`
			`fi`
			`if [ $mode == D ]; then`
			`printf '{"mime":%s,"data":%s,"width":%d,"height":%d}' \`
			`"$(json_string <<<"$mime")" "$("${cmd[@]}" \| base64 --wrap=0 \| json_string)" $w $h`
			`else`
			`printf '{"mime":%s,"width":%d,"height":%d}' "$(json_string <<<"$mime")" $w $h`
Initial version. 2016-01-21 22:12:19 +01:00			`fi`
			`;;`
			`R)`
			`if [ -f $doc/doc.pdf ]; then`
Better handling of metadata, individual PDF pages 2016-01-26 22:51:15 +01:00			`if [ -n "$pdfasjpg" ]; then`
			`pdftoppm -r $PDF_DPI -jpeg -f $page -l $page $doc/doc.pdf`
			`else`
			`cat $doc/doc.pdf`
			`fi`
Initial version. 2016-01-21 22:12:19 +01:00			`else`
			`cat $doc/paper.$page.jpg`
			`fi`
			`;;`
			`esac`