diff --git a/cli/paperfind.sh b/cli/paperfind.sh index b111198..b1d6eb3 100755 --- a/cli/paperfind.sh +++ b/cli/paperfind.sh @@ -23,18 +23,26 @@ # Result (WSN): # OUTPUT = "[" [ DOC_LIST ] "]" . # DOC_LIST = DOC { "," DOC } . -# DOC = "{""folder"":" FOLDER ",""labels"":[" [ LABELS ] "],""count"":" COUNT ",""type"":" TYPE "}" . +# DOC = "{""folder"":" FOLDER ",""labels"":[" [ LABELS ] +# "],""count"":" COUNT ",""type"":" TYPE ",""etag"":" ETAG "}" . # FOLDER = json_string . # LABELS = json_string { "," json_string } . # COUNT = json_number . # TYPE = """pdf""" | """pages""" . +# ETAG = json_string . +# +# Retrieve a single document's metadata: +# -M +# Result (WSN): +# OUTPUT = DOC . # # Retrieve a document's thumbnails: # -T : the folder-name of the document # Result (WSN): # OUTPUT = "[" THUMBS "]" . # THUMBS = CONTENTS { "," CONTENTS } . -# CONTENTS = "{""mime"":" MIME ",""data"":" B64_DATA ",""width"":" WIDTH ",""height"":" HEIGHT "}" . +# CONTENTS = "{""mime"":" MIME ",""data"":" B64_DATA ",""width"":" +# WIDTH ",""height"":" HEIGHT ",""etag"":" ETAG "}" . # MIME = json_string . # B64_DATA = json_string . # WIDTH = json_number . @@ -48,10 +56,11 @@ # document is a PDF file: then the whole PDF file is encoded. # Width and height should be ignored for PDF contents. # -# Retrieve a document's page's metadata without the actual page: +# Retrieve a document page's metadata without the actual page: # -M -p # Result (WSN): -# OUTPUT = "{""mime"":" MIME ",""width"":" WIDTH ",""height"":" HEIGHT "}" . +# OUTPUT = "{""mime"":" MIME ",""width"":" WIDTH ",""height"":" +# HEIGHT ",""etag"":" ETAG "}" . # Width and height should be ignored for PDF contents. # # Retrieve a raw document's page/PDF without metadata: @@ -101,29 +110,70 @@ Q) for ((i=${#dates[*]}-1;i>=0;i--)); do [ ${#dates[i]} -ge 4 ] || unset dates[i T) [ -n "$doc" -a -d "$BASE/$doc" ] || exit 3 ;; D|M|R) [ -n "$doc" -a -d "$BASE/$doc" ] || exit 3 - if [ -f "$BASE/$doc/doc.pdf" ]; then - if [ -n "$pdfasjpg" ]; then - maxp=$(pdfinfo "$BASE/$doc/doc.pdf" | awk '/^Pages:/{print $2}') - [ -n "$maxp" -a -n "$page" -a $page -gt 0 -a $page -le $maxp ] || exit 3 + if [ $mode != M -o -n "$page" ]; then + if [ -f "$BASE/$doc/doc.pdf" ]; then + if [ -n "$pdfasjpg" ]; then + maxp=$(pdfinfo "$BASE/$doc/doc.pdf" | awk '/^Pages:/{print $2}') + [ -n "$maxp" -a -n "$page" -a $page -gt 0 -a $page -le $maxp ] || exit 3 + fi + else + [ -f "$BASE/$doc/paper.$page.jpg" ] || exit 3 fi - else - [ -f "$BASE/$doc/paper.$page.jpg" ] || exit 3 fi ;; *) exit 1 ;; esac # RUN +# &0: image data +# &1: " " +function image_wh() { + file -b - | sed -r 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/' +} + # &0: raw string # &1: json string function json_string() { printf '"%s"' "$(sed 's#[\\/"]#\\&#g;s#\t#\\t#g')" } -# &0: image data -# &1: " " -function image_wh() { - file -b - | sed -r 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/' +# $1: folder name (relative path) +# &1: json DOC +function json_doc() { + local type count labs lab nil etag + [ -f $1/doc.pdf ] && type=pdf || type=pages + count=$(/bin/ls -1 $1/paper.*.thumb.jpg 2>/dev/null | wc -l) + labs="$( + while IFS=, read lab nil; do printf ','; json_string <<<"$lab"; done < <(sort -df $1/labels))" + etag=$(find $1 -maxdepth 0 -printf '%T@') + printf '{"folder":%s,"labels":[%s],"count":%d,"type":"%s","etag":%s}' \ + "$(json_string <<<"$1")" "${labs:1}" $count "$type" "$(json_string <<<"$etag")" +} + +# $1: file path +#[$2: page number (if it must be extracted from a PDF)] +#[$3: "nodata"] +# &1: json CONTENTS +function json_contents() { + local mime w=0 h=0 etag + local -a cmd=(cat $1) + mime=$(file -bi "$1" | cut -d';' -f1) + if [ "$mime" != 'application/pdf' ]; then + read w h < <(image_wh <$1) + elif [ -n "$2" -a -n "$pdfasjpg" ]; then + cmd=(pdftoppm -r $PDF_DPI -jpeg -f $2 -l $2 $1) + mime=image/jpeg + read w h < <("${cmd[@]}" | image_wh) + fi + etag=$(find $1 -printf '%T@') + if [ -n "$3" ]; then + printf '{"mime":%s,"width":%d,"height":%d,"etag":%s}' \ + "$(json_string <<<"$mime")" $w $h "$(json_string <<<"$etag")" + else + printf '{"mime":%s,"width":%d,"height":%d,"etag":%s,"data":%s}' \ + "$(json_string <<<"$mime")" $w $h "$(json_string <<<"$etag")" \ + "$("${cmd[@]}" | base64 --wrap=0 | json_string)" + fi } cd "$BASE" @@ -167,12 +217,7 @@ Q) fi printf '[' while IFS=/ read folder nil; do if [ -n "$folder" ]; then - [ -f $folder/doc.pdf ] && type=pdf || type=pages - count=$(/bin/ls -1 $folder/paper.*.thumb.jpg 2>/dev/null | wc -l) - labs="$( - while IFS=, read lab nil; do printf ','; json_string <<<"$lab"; done < <(sort -df $folder/labels))" - printf ',{"folder":%s,"labels":[%s],"count":%d,"type":"%s"}' \ - "$(json_string <<<"$folder")" "${labs:1}" $count "$type" + printf ',%s' "$(json_doc $folder)" fi; done < <(sort -r <<<"$found") | sed 's/^.//' printf ']' ;; @@ -180,34 +225,17 @@ T) printf '[' /bin/ls -1 $doc/paper.*.thumb.jpg | sort -t. -k2,2n \ | while read t; do - read w h < <(image_wh <$t) - printf ',{"mime":"image\/jpeg","data":%s,"width":%d,"height":%d}' \ - "$(base64 --wrap=0 "$t" | json_string)" $w $h + printf ',%s' "$(json_contents $t)" done | sed 's/^.//' printf ']' ;; D|M) - if [ -f $doc/doc.pdf ]; then - p=$doc/doc.pdf - if [ -n "$pdfasjpg" ]; then - read w h < <(pdftoppm -r $PDF_DPI -jpeg -f $page -l $page $p | image_wh) - mime='image/jpeg' - cmd=(pdftoppm -r $PDF_DPI -jpeg -f $page -l $page $p) - else - w=0; h=0; mime='application/pdf' - cmd=(cat $p) - fi + if [ -z "$page" ]; then + json_doc $doc + elif [ -f $doc/doc.pdf ]; then + json_contents $doc/doc.pdf $page ${mode/D} else - p=$doc/paper.$page.jpg - read w h < <(image_wh <$p) - mime='image/jpeg' - cmd=(cat $p) - fi - if [ $mode == D ]; then - printf '{"mime":%s,"data":%s,"width":%d,"height":%d}' \ - "$(json_string <<<"$mime")" "$("${cmd[@]}" | base64 --wrap=0 | json_string)" $w $h - else - printf '{"mime":%s,"width":%d,"height":%d}' "$(json_string <<<"$mime")" $w $h + json_contents $doc/paper.$page.jpg '' ${mode/D} fi ;; R)