From d1c5cae9f87a307df87319814f99b1d481c2571b Mon Sep 17 00:00:00 2001 From: Y Date: Tue, 26 Jan 2016 22:51:15 +0100 Subject: [PATCH] Better handling of metadata, individual PDF pages --- cli/paperfind.sh | 78 ++++++++++++++++++++++++++++++++++++++++-------- web/paperweb.php | 28 ++++++++--------- 2 files changed, 79 insertions(+), 27 deletions(-) diff --git a/cli/paperfind.sh b/cli/paperfind.sh index fa2ca4f..b111198 100755 --- a/cli/paperfind.sh +++ b/cli/paperfind.sh @@ -13,6 +13,7 @@ # along with this program. If not, see . # # -h This help. +# # Query: # -Q [-d ] # [-l ] @@ -27,6 +28,7 @@ # LABELS = json_string { "," json_string } . # COUNT = json_number . # TYPE = """pdf""" | """pages""" . +# # Retrieve a document's thumbnails: # -T : the folder-name of the document # Result (WSN): @@ -37,18 +39,35 @@ # B64_DATA = json_string . # WIDTH = json_number . # HEIGHT = json_number . +# # Retrieve a document's page/PDF and metadata: # -D -p # Result (WSN): # OUTPUT = CONTENTS . +# The "data" field contains the wanted page in JPEG format, except if the +# document is a PDF file: then the whole PDF file is encoded. # Width and height should be ignored for PDF contents. +# +# Retrieve a document's page's metadata without the actual page: +# -M -p +# Result (WSN): +# OUTPUT = "{""mime"":" MIME ",""width"":" WIDTH ",""height"":" HEIGHT "}" . +# Width and height should be ignored for PDF contents. +# # Retrieve a raw document's page/PDF without metadata: # -R -p -# Result: file contents. +# Result: page contents. +# The returned data is the raw page in JPEG format, except if the document is +# a PDF file: then the whole PDF file is returned. +# +# The behaviour exposed above changes if the commands "pdfinfo" and "pdftoppm" +# are both available. In this case, pages from PDF documents are treated the +# same way as pages from image-based documents. ########## CONFIGURATION ########## BASE='/PATH/TO/PAPERWORK/BASE/DIRECTORY' +PDF_DPI=90 ##### NO CHANGE PAST THIS LINE ##### @@ -60,16 +79,17 @@ words=() q_ci= doc= page= +{ type pdfinfo && type pdftoppm; } &>/dev/null && pdfasjpg=true # READ COMMAND LINE PARAMETERS -while getopts hQd:l:k:iT:D:p:R: opt; do case "$opt" in +while getopts hQd:l:k:iT:D:p:M:R: opt; do case "$opt" in h) sed -n '2,/^$/s/.//p' "$0"; exit 0 ;; Q) mode=Q ;; d) IFS='|' read -a dates < <(tr -dc '|[:digit:]' <<<"$OPTARG") ;; l) IFS='|' read -a labels < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;; k) IFS='|' read -a words < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;; i) q_ci=true ;; -T|D|R) +T|D|M|R) mode=$opt; doc=$(tr -dc '[:digit:]_' <<<"$OPTARG") ;; p) page=$(tr -dc '[:digit:]' <<<"$OPTARG") ;; esac; done @@ -79,17 +99,33 @@ case "$mode" in Q) for ((i=${#dates[*]}-1;i>=0;i--)); do [ ${#dates[i]} -ge 4 ] || unset dates[i]; done [ -n "${dates[*]}${labels[*]}${words[*]}" ] || exit 2 ;; T) [ -n "$doc" -a -d "$BASE/$doc" ] || exit 3 ;; -D|R) +D|M|R) [ -n "$doc" -a -d "$BASE/$doc" ] || exit 3 - [ -f "$BASE/$doc/doc.pdf" -o -f "$BASE/$doc/paper.$page.jpg" ] || exit 3 ;; + if [ -f "$BASE/$doc/doc.pdf" ]; then + if [ -n "$pdfasjpg" ]; then + maxp=$(pdfinfo "$BASE/$doc/doc.pdf" | awk '/^Pages:/{print $2}') + [ -n "$maxp" -a -n "$page" -a $page -gt 0 -a $page -le $maxp ] || exit 3 + fi + else + [ -f "$BASE/$doc/paper.$page.jpg" ] || exit 3 + fi ;; *) exit 1 ;; esac # RUN + +# &0: raw string +# &1: json string function json_string() { printf '"%s"' "$(sed 's#[\\/"]#\\&#g;s#\t#\\t#g')" } +# &0: image data +# &1: " " +function image_wh() { + file -b - | sed -r 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/' +} + cd "$BASE" case "$mode" in Q) @@ -144,27 +180,43 @@ T) printf '[' /bin/ls -1 $doc/paper.*.thumb.jpg | sort -t. -k2,2n \ | while read t; do - read w h < <(file -b "$t" | sed -r 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/') + read w h < <(image_wh <$t) printf ',{"mime":"image\/jpeg","data":%s,"width":%d,"height":%d}' \ "$(base64 --wrap=0 "$t" | json_string)" $w $h done | sed 's/^.//' printf ']' ;; -D) +D|M) if [ -f $doc/doc.pdf ]; then p=$doc/doc.pdf - w=0; h=0 + if [ -n "$pdfasjpg" ]; then + read w h < <(pdftoppm -r $PDF_DPI -jpeg -f $page -l $page $p | image_wh) + mime='image/jpeg' + cmd=(pdftoppm -r $PDF_DPI -jpeg -f $page -l $page $p) + else + w=0; h=0; mime='application/pdf' + cmd=(cat $p) + fi else p=$doc/paper.$page.jpg - read w h < <(file -b "$p" | sed -r 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/') + read w h < <(image_wh <$p) + mime='image/jpeg' + cmd=(cat $p) + fi + if [ $mode == D ]; then + printf '{"mime":%s,"data":%s,"width":%d,"height":%d}' \ + "$(json_string <<<"$mime")" "$("${cmd[@]}" | base64 --wrap=0 | json_string)" $w $h + else + printf '{"mime":%s,"width":%d,"height":%d}' "$(json_string <<<"$mime")" $w $h fi - mime=$(file -bi $p | cut -d';' -f1) - printf '{"mime":%s,"data":%s,"width":%d,"height":%d}' \ - "$(json_string <<<"$mime")" "$(base64 --wrap=0 "$p" | json_string)" $w $h ;; R) if [ -f $doc/doc.pdf ]; then - cat $doc/doc.pdf + if [ -n "$pdfasjpg" ]; then + pdftoppm -r $PDF_DPI -jpeg -f $page -l $page $doc/doc.pdf + else + cat $doc/doc.pdf + fi else cat $doc/paper.$page.jpg fi diff --git a/web/paperweb.php b/web/paperweb.php index cc1a82f..c25ebb9 100644 --- a/web/paperweb.php +++ b/web/paperweb.php @@ -20,11 +20,16 @@ $USER='USER THAT SUDO WILL RUN paperfind.sh AS'; ##### NO CHANGE PAST THIS LINE ##### if (array_key_exists('doDownload', $_REQUEST)) { - $mime = (@$_REQUEST['type'] == 'pdf' ? 'application/pdf' : 'image/jpeg'); $date = escapeshellarg(@$_REQUEST['date']); $page = escapeshellarg(@$_REQUEST['page']); - header("Content-Type: {$mime}"); - passthru("sudo -u {$USER} {$PATH} -R {$date} -p {$page}"); + + # -M and -R are used instead of -D to avoid storing the data in RAM + $json = exec("sudo -u {$USER} {$PATH} -M {$date} -p {$page}"); + if ($json) { + $meta = json_decode($json, true); + header("Content-Type: {$meta['mime']}"); + passthru("sudo -u {$USER} {$PATH} -R {$date} -p {$page}"); + } } else { ?> @@ -73,10 +78,9 @@ if (array_key_exists('doDownload', $_REQUEST)) { -