From d1c5cae9f87a307df87319814f99b1d481c2571b Mon Sep 17 00:00:00 2001
From: Y <theYinYeti@yalis.fr>
Date: Tue, 26 Jan 2016 22:51:15 +0100
Subject: [PATCH] Better handling of metadata, individual PDF pages

---
 cli/paperfind.sh | 78 ++++++++++++++++++++++++++++++++++++++++--------
 web/paperweb.php | 28 ++++++++---------
 2 files changed, 79 insertions(+), 27 deletions(-)
diff --git a/cli/paperfind.sh b/cli/paperfind.sh
index fa2ca4f..b111198 100755
--- a/cli/paperfind.sh
+++ b/cli/paperfind.sh
@@ -13,6 +13,7 @@
 #  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 #   -h This help.
+#
 # Query:
 #   -Q [-d <pipe-separated list of YYYYMMDD/YYYYMM/YYYY prefixes to search>]
 #      [-l <pipe-separated list of labels to match (all must be found)>]
@@ -27,6 +28,7 @@
 #     LABELS   = json_string { "," json_string } .
 #     COUNT    = json_number .
 #     TYPE     = """pdf""" | """pages""" .
+#
 # Retrieve a document's thumbnails:
 #   -T <date> : the folder-name of the document
 #   Result (WSN):
@@ -37,18 +39,35 @@
 #     B64_DATA = json_string .
 #     WIDTH    = json_number .
 #     HEIGHT   = json_number .
+#
 # Retrieve a document's page/PDF and metadata:
 #   -D <date> -p <page number>
 #   Result (WSN):
 #     OUTPUT   = CONTENTS .
+#   The "data" field contains the wanted page in JPEG format, except if the
+#   document is a PDF file: then the whole PDF file is encoded.
 #   Width and height should be ignored for PDF contents.
+#
+# Retrieve a document's page's metadata without the actual page:
+#   -M <date> -p <page number>
+#   Result (WSN):
+#     OUTPUT   = "{""mime"":" MIME ",""width"":" WIDTH ",""height"":" HEIGHT "}" .
+#   Width and height should be ignored for PDF contents.
+#
 # Retrieve a raw document's page/PDF without metadata:
 #   -R <date> -p <page number>
-#   Result: file contents.
+#   Result: page contents.
+#   The returned data is the raw page in JPEG format, except if the document is
+#   a PDF file: then the whole PDF file is returned.
+#
+# The behaviour described above changes if the commands "pdfinfo" and "pdftoppm"
+# are both available. In this case, pages from PDF documents are treated the
+# same way as pages from image-based documents.
 
 ##########  CONFIGURATION ########## 
 
 BASE='/PATH/TO/PAPERWORK/BASE/DIRECTORY'
+PDF_DPI=90
 
 ##### NO CHANGE PAST THIS LINE #####
 
@@ -60,16 +79,17 @@ words=()
 q_ci=
 doc=
 page=
+pdfasjpg=; { type pdfinfo && type pdftoppm; } &>/dev/null && pdfasjpg=true  # reset first: ignore any inherited value
 
 # READ COMMAND LINE PARAMETERS
-while getopts hQd:l:k:iT:D:p:R: opt; do case "$opt" in
+while getopts hQd:l:k:iT:D:p:M:R: opt; do case "$opt" in
 h) sed -n '2,/^$/s/.//p' "$0"; exit 0 ;;
 Q) mode=Q ;;
 d) IFS='|' read -a dates < <(tr -dc '|[:digit:]' <<<"$OPTARG") ;;
 l) IFS='|' read -a labels < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
 k) IFS='|' read -a words < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
 i) q_ci=true ;;
-T|D|R)
+T|D|M|R)
    mode=$opt; doc=$(tr -dc '[:digit:]_' <<<"$OPTARG") ;;
 p) page=$(tr -dc '[:digit:]' <<<"$OPTARG") ;;
 esac; done
@@ -79,17 +99,33 @@ case "$mode" in
 Q) for ((i=${#dates[*]}-1;i>=0;i--)); do [ ${#dates[i]} -ge 4 ] || unset dates[i]; done
    [ -n "${dates[*]}${labels[*]}${words[*]}" ] || exit 2 ;;
 T) [ -n "$doc" -a -d "$BASE/$doc" ] || exit 3 ;;
-D|R)
+D|M|R)
    [ -n "$doc" -a -d "$BASE/$doc" ] || exit 3
-   [ -f "$BASE/$doc/doc.pdf" -o -f "$BASE/$doc/paper.$page.jpg" ] || exit 3 ;;
+   if [ -f "$BASE/$doc/doc.pdf" ]; then
+     if [ -n "$pdfasjpg" ]; then  # PDF pages are served one by one: the page must be within range
+       maxp=$(pdfinfo "$BASE/$doc/doc.pdf" | awk '/^Pages:/{print $2}')
+       { [ -n "$maxp" ] && [ -n "$page" ] && [ "$page" -gt 0 ] && [ "$page" -le "$maxp" ]; } || exit 3
+     fi
+   else
+     [ -f "$BASE/$doc/paper.$page.jpg" ] || exit 3
+   fi ;;
 *) exit 1 ;;
 esac
 
 # RUN
+
+# &0: raw string
+# &1: json string
 function json_string() {
   printf '"%s"' "$(sed 's#[\\/"]#\\&#g;s#\t#\\t#g')"
 }
 
+# &0: image data
+# &1: "<width> <height>" (nothing at all if "file" reports no WxH geometry)
+function image_wh() {
+  file -b - | sed -rn 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/p'
+}
+
 cd "$BASE"
 case "$mode" in
 Q)
@@ -144,27 +180,43 @@ T)
   printf '['
   /bin/ls -1 $doc/paper.*.thumb.jpg | sort -t. -k2,2n \
   | while read t; do
-    read w h < <(file -b "$t" | sed -r 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/')
+    read w h < <(image_wh <"$t")  # quoted: an unquoted redirect target with spaces is ambiguous
     printf ',{"mime":"image\/jpeg","data":%s,"width":%d,"height":%d}' \
       "$(base64 --wrap=0 "$t" | json_string)" $w $h
   done | sed 's/^.//'
   printf ']'
   ;;
-D)
+D|M)
   if [ -f $doc/doc.pdf ]; then
     p=$doc/doc.pdf
-    w=0; h=0
+    if [ -n "$pdfasjpg" ]; then
+      cmd=(pdftoppm -r "$PDF_DPI" -jpeg -f "$page" -l "$page" "$p")  # defined once, reused below
+      read w h < <("${cmd[@]}" | image_wh)  # size probe renders the page to stdout
+      mime='image/jpeg'
+    else
+      w=0; h=0; mime='application/pdf'
+      cmd=(cat "$p")
+    fi
   else
     p=$doc/paper.$page.jpg
-    read w h < <(file -b "$p" | sed -r 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/')
+    read w h < <(image_wh <"$p")
+    mime='image/jpeg'
+    cmd=(cat "$p")
+  fi
+  if [ $mode == D ]; then
+    printf '{"mime":%s,"data":%s,"width":%d,"height":%d}' \
+      "$(json_string <<<"$mime")" "$("${cmd[@]}" | base64 --wrap=0 | json_string)" $w $h
+  else
+    printf '{"mime":%s,"width":%d,"height":%d}' "$(json_string <<<"$mime")" $w $h
   fi
-  mime=$(file -bi $p | cut -d';' -f1)
-  printf '{"mime":%s,"data":%s,"width":%d,"height":%d}' \
-    "$(json_string <<<"$mime")" "$(base64 --wrap=0 "$p" | json_string)" $w $h
   ;;
 R)
   if [ -f $doc/doc.pdf ]; then
-    cat $doc/doc.pdf
+    if [ -n "$pdfasjpg" ]; then
+      pdftoppm -r $PDF_DPI -jpeg -f $page -l $page $doc/doc.pdf
+    else
+      cat $doc/doc.pdf
+    fi
   else
     cat $doc/paper.$page.jpg
   fi
diff --git a/web/paperweb.php b/web/paperweb.php
index cc1a82f..c25ebb9 100644
--- a/web/paperweb.php
+++ b/web/paperweb.php
@@ -20,11 +20,16 @@ $USER='USER THAT SUDO WILL RUN paperfind.sh AS';
 ##### NO CHANGE PAST THIS LINE #####
 
 if (array_key_exists('doDownload', $_REQUEST)) {
-  $mime = (@$_REQUEST['type'] == 'pdf' ? 'application/pdf' : 'image/jpeg');
   $date = escapeshellarg(@$_REQUEST['date']);
   $page = escapeshellarg(@$_REQUEST['page']);
-  header("Content-Type: {$mime}");
-  passthru("sudo -u {$USER} {$PATH} -R {$date} -p {$page}");
+
+  # -M and -R are used instead of -D to avoid storing the data in RAM
+  $json = exec("sudo -u {$USER} {$PATH} -M {$date} -p {$page}");
+  $meta = $json ? json_decode($json, true) : null;
+  if (is_array($meta) && !empty($meta['mime'])) {  # only stream when the metadata is usable
+    header("Content-Type: {$meta['mime']}");
+    passthru("sudo -u {$USER} {$PATH} -R {$date} -p {$page}");
+  } else { header('HTTP/1.0 404 Not Found'); }  # unknown document/page or unreadable metadata
 } else {
 ?>
 <!DOCTYPE html>
@@ -73,10 +78,9 @@ if (array_key_exists('doDownload', $_REQUEST)) {
     <input type="hidden" name="queryDone" value="<?php echo htmlentities($json); ?>"/>
 <?php
       foreach (json_decode($json, true) as $doc) {
-        $docId = $doc['folder'].'|'.$doc['type'];
 ?>
-    <button type="submit" name="doThumbnails" value="<?php echo htmlentities($docId); ?>"<?php
-        if ($docId == $current) {
+    <button type="submit" name="doThumbnails" value="<?php echo htmlentities($doc['folder']); ?>"<?php
+        if ($doc['folder'] == $current) {
           echo ' disabled="disabled"';
         }
     ?>>
@@ -99,27 +103,23 @@ if (array_key_exists('doDownload', $_REQUEST)) {
   <section id="thumbs">
 <?php
   if (array_key_exists('doThumbnails', $_REQUEST)) {
-    $do = explode('|', $_REQUEST['doThumbnails']);
-    $date = $do[0];
-    $type = $do[1];
+    $date = $_REQUEST['doThumbnails'];
     $datearg = escapeshellarg($date);
     $json = exec("sudo -u {$USER} {$PATH} -T {$datearg}");
   } else {
     $json = @$_REQUEST['thumbnailsDone'];
-    $do = explode('|', @$_REQUEST['currentDoc']);
-    $date = @$do[0];
-    $type = @$do[1];
+    $date = @$_REQUEST['currentDoc'];
   }
   if ($json) {
 ?>
     <h2>Pages</h2>
     <input type="hidden" name="thumbnailsDone" value="<?php echo htmlentities($json); ?>"/>
-    <input type="hidden" name="currentDoc" value="<?php echo htmlentities($date.'|'.$type); ?>"/>
+    <input type="hidden" name="currentDoc" value="<?php echo htmlentities($date); ?>"/>
 <?php
     foreach (json_decode($json, true) as $n => $p) {
       $nump = $n+1;
 ?>
-    <a target="_blank" href="?<?php echo htmlentities("doDownload=1&type={$type}&date={$date}&page={$nump}"); ?>"><img
+    <a target="_blank" href="?<?php echo htmlentities('doDownload=1&date='.urlencode($date)."&page={$nump}"); ?>"><img
       src="data:<?php echo $p['mime']; ?>;base64,<?php echo $p['data']; ?>"
       width="<?php echo $p['width']; ?>" height="<?php echo $p['height']; ?>"
       alt="Page <?php echo $nump; ?>"/></a>