#!/bin/bash
# PAPERWEB - GPLv3 licence
# Copyright 2016 Yves Gablin
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#   -h  This help.
#
# Query:
#   -Q [-d <dates>]
#      [-l <labels>]
#      [-k <keywords>]
#      [-i] : case-insensitive search
#   At least -d, -l, or -k must be used.
#   Result (WSN):
#     OUTPUT   = "[" [ DOC_LIST ] "]" .
#     DOC_LIST = DOC { "," DOC } .
#     DOC      = "{""folder"":" FOLDER ",""labels"":[" [ LABELS ]
#                "],""count"":" COUNT ",""type"":" TYPE ",""etag"":" ETAG "}" .
#     FOLDER   = json_string .
#     LABELS   = json_string { "," json_string } .
#     COUNT    = json_number .
#     TYPE     = """pdf""" | """pages""" .
#     ETAG     = json_string .
#
# Retrieve a single document's metadata:
#   -M <folder>
#   Result (WSN):
#     OUTPUT = DOC .
#
# Retrieve a document's thumbnails:
#   -T <folder> : the folder-name of the document
#   Result (WSN):
#     OUTPUT   = "[" THUMBS "]" .
#     THUMBS   = CONTENTS { "," CONTENTS } .
#     CONTENTS = "{""mime"":" MIME ",""data"":" B64_DATA ",""width"":"
#                WIDTH ",""height"":" HEIGHT ",""etag"":" ETAG "}" .
#     MIME     = json_string .
#     B64_DATA = json_string .
#     WIDTH    = json_number .
#     HEIGHT   = json_number .
#
# Retrieve a document's page/PDF and metadata:
#   -D <folder> -p <page>
#   Result (WSN):
#     OUTPUT = CONTENTS .
#   The "data" field contains the wanted page in JPEG format, except if the
#   document is a PDF file: then the whole PDF file is encoded.
#   Width and height should be ignored for PDF contents.
#
# Retrieve a document page's metadata without the actual page:
#   -M <folder> -p <page>
#   Result (WSN):
#     OUTPUT = "{""mime"":" MIME ",""width"":" WIDTH ",""height"":"
#              HEIGHT ",""etag"":" ETAG "}" .
#   Width and height should be ignored for PDF contents.
#
# Retrieve a raw document's page/PDF without metadata:
#   -R <folder> -p <page>
#   Result: page contents.
#   The returned data is the raw page in JPEG format, except if the document is
#   a PDF file: then the whole PDF file is returned.
#
# The behaviour exposed above changes if the commands "pdfinfo" and "pdftoppm"
# are both available. In this case, pages from PDF documents are treated the
# same way as pages from image-based documents.

########## CONFIGURATION ##########
BASE='/PATH/TO/PAPERWORK/BASE/DIRECTORY'
PDF_DPI=90
##### NO CHANGE PAST THIS LINE #####

# Exit codes:
#   0: success
#   1: no mode option (-Q, -T, -D, -M or -R) was given
#   2: -Q was used without any usable -d, -l or -k criterion
#   3: unknown document folder or page
#   4: the BASE directory cannot be entered

# INIT
mode=
dates=()
labels=()
words=()
q_ci=
doc=
page=
# Reset explicitly so that a value inherited from the environment cannot
# switch on the PDF-to-JPEG code path when the tools are missing.
pdfasjpg=
if command -v pdfinfo >/dev/null 2>&1 && command -v pdftoppm >/dev/null 2>&1; then
  pdfasjpg=true
fi

# READ COMMAND LINE PARAMETERS
# -d, -l and -k take '|'-separated lists. Every value is sanitised here:
# dates, folder names and page numbers keep only the characters they may
# legitimately contain; labels and keywords lose commas, double quotes and
# control characters. "read -r" keeps backslashes literal.
while getopts hQd:l:k:iT:D:p:M:R: opt; do
  case "$opt" in
    h)
      # The help text is the comment block from line 2 up to the first
      # empty line of this file, with the leading '#' removed.
      sed -n '2,/^$/s/.//p' "$0"
      exit 0
      ;;
    Q) mode=Q ;;
    d) IFS='|' read -r -a dates < <(tr -dc '|[:digit:]' <<<"$OPTARG") ;;
    l) IFS='|' read -r -a labels < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
    k) IFS='|' read -r -a words < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
    i) q_ci=true ;;
    T | D | M | R)
      mode=$opt
      doc=$(tr -dc '[:digit:]_' <<<"$OPTARG")
      ;;
    p) page=$(tr -dc '[:digit:]' <<<"$OPTARG") ;;
  esac
done

# CHECK COMMAND LINE PARAMETERS
case "$mode" in
  Q)
    # Drop date prefixes shorter than 4 digits. The array is rebuilt rather
    # than punched with "unset", so it stays dense and its length always
    # matches its usable content.
    kept=()
    for d in "${dates[@]}"; do
      if ((${#d} >= 4)); then
        kept+=("$d")
      fi
    done
    dates=("${kept[@]}")
    [[ -n "${dates[*]}${labels[*]}${words[*]}" ]] || exit 2
    ;;
  T)
    [[ -n "$doc" && -d "$BASE/$doc" ]] || exit 3
    ;;
  D | M | R)
    [[ -n "$doc" && -d "$BASE/$doc" ]] || exit 3
    # -M without -p only asks for the document metadata: no page to check.
    if [[ "$mode" != M || -n "$page" ]]; then
      if [[ -f "$BASE/$doc/doc.pdf" ]]; then
        if [[ -n "$pdfasjpg" ]]; then
          maxp=$(pdfinfo "$BASE/$doc/doc.pdf" | awk '/^Pages:/{print $2}')
          [[ "$maxp" =~ ^[0-9]+$ && -n "$page" ]] || exit 3
          # 10# forces base 10 so that a page such as "08" is not read as
          # an (invalid) octal number.
          ((10#$page > 0 && 10#$page <= 10#$maxp)) || exit 3
        fi
      else
        [[ -f "$BASE/$doc/paper.$page.jpg" ]] || exit 3
      fi
    fi
    ;;
  *)
    exit 1
    ;;
esac

# RUN

# Extracts the pixel size of an image from the description given by file(1).
# If that description has no ", <W>x<H>, <last field>" tail, the line is
# passed through unchanged; callers must validate the result.
# &0: image data
# &1: "<width> <height>"
image_wh() {
  file -b - | sed -r 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/'
}

# Wraps its input in double quotes, escaping backslash, slash, double quote
# and tab. Other control characters are not escaped.
# &0: raw string
# &1: json string
json_string() {
  printf '"%s"' "$(sed 's#[\\/"]#\\&#g;s#\t#\\t#g')"
}

# Describes one document folder.
# $1: folder name (relative path)
# &1: json DOC
json_doc() {
  local doc_type=pages count=0 labs='' sep='' lab rest etag thumb

  if [[ -f "$1/doc.pdf" ]]; then
    doc_type=pdf
  fi

  # One thumbnail per page; the -e test discards an unmatched glob.
  for thumb in "$1"/paper.*.thumb.jpg; do
    if [[ -e "$thumb" ]]; then
      count=$((count + 1))
    fi
  done

  # Only the part before the first comma of each line of "labels" is output.
  if [[ -f "$1/labels" ]]; then
    while IFS=, read -r lab rest; do
      labs+="$sep$(json_string <<<"$lab")"
      sep=,
    done < <(sort -df -- "$1/labels")
  fi

  # The folder's modification time serves as the etag.
  etag=$(find "$1" -maxdepth 0 -printf '%T@')

  printf '{"folder":%s,"labels":[%s],"count":%d,"type":"%s","etag":%s}' \
    "$(json_string <<<"$1")" "$labs" "$count" "$doc_type" \
    "$(json_string <<<"$etag")"
}

# Describes one page, thumbnail or PDF file, with or without its contents.
# $1: file path
#[$2: page number (if it must be extracted from a PDF)]
#[$3: "nodata" (any non-empty value leaves the "data" field out)]
# &1: json CONTENTS
json_contents() {
  local mime w=0 h=0 etag
  local -a cmd=(cat -- "$1")

  mime=$(file -bi "$1" | cut -d';' -f1)
  if [[ "$mime" != 'application/pdf' ]]; then
    read -r w h < <(image_wh <"$1")
  elif [[ -n "$2" && -n "$pdfasjpg" ]]; then
    # Render the requested PDF page as a JPEG instead of sending the PDF.
    cmd=(pdftoppm -r "$PDF_DPI" -jpeg -f "$2" -l "$2" "$1")
    mime=image/jpeg
    read -r w h < <("${cmd[@]}" | image_wh)
  fi
  # image_wh echoes file(1)'s description unchanged when it finds no size;
  # fall back to 0x0 rather than feed free text to printf's %d.
  if ! [[ "$w" =~ ^[0-9]+$ && "$h" =~ ^[0-9]+$ ]]; then
    w=0
    h=0
  fi

  etag=$(find "$1" -printf '%T@')

  if [[ -n "$3" ]]; then
    printf '{"mime":%s,"width":%d,"height":%d,"etag":%s}' \
      "$(json_string <<<"$mime")" "$w" "$h" "$(json_string <<<"$etag")"
  else
    printf '{"mime":%s,"width":%d,"height":%d,"etag":%s,"data":%s}' \
      "$(json_string <<<"$mime")" "$w" "$h" "$(json_string <<<"$etag")" \
      "$("${cmd[@]}" | base64 --wrap=0 | json_string)"
  fi
}

# Never run a query against whatever the current directory happens to be.
if ! cd "$BASE"; then
  printf 'paperweb: cannot enter base directory: %s\n' "$BASE" >&2
  exit 4
fi

case "$mode" in
  Q)
    # "found" holds the candidate folder names, one per line. Each filter
    # below narrows it down (logical AND between all criteria).
    found=$(
      if ((${#dates[@]} == 0)); then
        find . -mindepth 1 -maxdepth 1 -type d -printf '%P\n' 2>/dev/null
      else
        # Every date is a prefix of the wanted folder names.
        for d in "${dates[@]}"; do
          for dir in "$d"*; do
            if [[ -d "$dir" && ! -L "$dir" ]]; then
              printf '%s\n' "$dir"
            fi
          done
        done
      fi
    )

    # Labels: a folder is kept when its "labels" file contains the text.
    # The label is handed to grep as a single literal argument; it is never
    # re-parsed by the shell, so it may contain spaces or shell
    # metacharacters.
    for l in "${labels[@]}"; do
      [[ -n "$found" ]] || break
      files=()
      while IFS= read -r folder; do
        files+=("$folder/labels")
      done <<<"$found"
      found=$(grep -lF${q_ci:+i} -e "$l" -- "${files[@]}" 2>/dev/null \
        | sed 's#/labels$##')
    done

    # Keywords: searched in the paper.<n>.words files of each folder.
    if ((${#words[@]} > 0)) && [[ -n "$found" ]]; then
      files=()
      while IFS= read -r folder; do
        for f in "$folder"/paper.*.words; do
          if [[ -f "$f" ]]; then
            files+=("$f")
          fi
        done
      done <<<"$found"

      # NOTE(review): the extraction pattern and the sed script of this step
      # were damaged in the reviewed copy of the script (markup-like text had
      # been stripped). They are reconstructed here as "text found between
      # '>' and '<', with &lt; &gt; &amp; decoded" - confirm against the
      # real format of the .words files.
      hits=
      if ((${#files[@]} > 0)); then
        hits=$(grep -Ho -e '>[^<>]*<' -- "${files[@]}" 2>/dev/null \
          | sed 's#&lt;#<#g;s#&gt;#>#g;s#&amp;#\&#g')
      fi
      # NOTE(review): each hit line still starts with "<file>:", so a keyword
      # can also match the path of the .words file, as in the original.
      for w in "${words[@]}"; do
        hits=$(grep -F${q_ci:+i} -e "$w" <<<"$hits")
      done
      found=$(cut -d: -f1 <<<"$hits" \
        | sed 's#/paper\.[0-9]*\.words$##' \
        | sort -u)
    fi

    # Most recent folders first; -u drops the duplicates that overlapping
    # date prefixes would otherwise produce.
    printf '['
    sep=
    while IFS= read -r folder; do
      [[ -n "$folder" ]] || continue
      printf '%s%s' "$sep" "$(json_doc "$folder")"
      sep=,
    done < <(sort -ru <<<"$found")
    printf ']'
    ;;
  T)
    printf '['
    sep=
    # Thumbnails in page order: the page number is the second '.'-separated
    # field of "<doc>/paper.<n>.thumb.jpg".
    while IFS= read -r t; do
      [[ -f "$t" ]] || continue
      printf '%s%s' "$sep" "$(json_contents "$t")"
      sep=,
    done < <(printf '%s\n' "$doc"/paper.*.thumb.jpg | sort -t. -k2,2n)
    printf ']'
    ;;
  D | M)
    # ${mode/D} is empty for -D (data wanted) and "M" for -M (no data).
    if [[ -z "$page" ]]; then
      json_doc "$doc"
    elif [[ -f "$doc/doc.pdf" ]]; then
      json_contents "$doc/doc.pdf" "$page" "${mode/D}"
    else
      json_contents "$doc/paper.$page.jpg" '' "${mode/D}"
    fi
    ;;
  R)
    if [[ -f "$doc/doc.pdf" ]]; then
      if [[ -n "$pdfasjpg" ]]; then
        pdftoppm -r "$PDF_DPI" -jpeg -f "$page" -l "$page" "$doc/doc.pdf"
      else
        cat -- "$doc/doc.pdf"
      fi
    else
      cat -- "$doc/paper.$page.jpg"
    fi
    ;;
esac