#!/bin/bash
# PAPERWEB - GPLv3 licence
# Copyright 2016 Yves Gablin
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# -h  This help.
# Query:
#   -Q [-d <dates>]    : '|'-separated folder-name prefixes (digits only, 4+ digits each)
#      [-l <labels>]   : '|'-separated labels (all of them must match)
#      [-k <keywords>] : '|'-separated keywords (all of them must match)
#      [-i] : case-insensitive search
#   At least -d, -l, or -k must be used.
#   Result (WSN):
#     OUTPUT = "[" [ DOC_LIST ] "]" .
#     DOC_LIST = DOC { "," DOC } .
#     DOC = "{""folder"":" FOLDER ",""labels"":[" [ LABELS ] "],""count"":" COUNT ",""type"":" TYPE "}" .
#     FOLDER = json_string .
#     LABELS = json_string { "," json_string } .
#     COUNT = json_number .
#     TYPE = """pdf""" | """pages""" .
# Retrieve a document's thumbnails:
#   -T <doc> : the folder-name of the document
#   Result (WSN):
#     OUTPUT = "[" THUMBS "]" .
#     THUMBS = CONTENTS { "," CONTENTS } .
#     CONTENTS = "{""mime"":" MIME ",""data"":" B64_DATA ",""width"":" WIDTH ",""height"":" HEIGHT "}" .
#     MIME = json_string .
#     B64_DATA = json_string .
#     WIDTH = json_number .
#     HEIGHT = json_number .
# Retrieve a document's page/PDF and metadata:
#   -D <doc> -p <page>
#   Result (WSN):
#     OUTPUT = CONTENTS .
#   Width and height should be ignored for PDF contents.
# Retrieve a raw document's page/PDF without metadata:
#   -R <doc> -p <page>
#   Result: file contents.
# Exit status:
#   0 success, 1 no mode given, 2 query without usable criterion,
#   3 base directory, document or page not found.

########## CONFIGURATION ##########
BASE='/PATH/TO/PAPERWORK/BASE/DIRECTORY'
##### NO CHANGE PAST THIS LINE #####

# INIT
mode=
dates=()
labels=()
words=()
q_ci=
doc=
page=

# READ COMMAND LINE PARAMETERS
# Every argument is whitelisted/stripped with tr before use: dates and page
# numbers keep digits only, document names keep digits and '_', labels and
# keywords lose commas, double quotes and control characters.
while getopts hQd:l:k:iT:D:p:R: opt; do
  case "$opt" in
    # The help text is the comment header: line 2 up to the first blank
    # line, with the leading '#' removed.
    h) sed -n '2,/^$/s/.//p' "$0"; exit 0 ;;
    Q) mode=Q ;;
    d) IFS='|' read -ra dates < <(tr -dc '|[:digit:]' <<<"$OPTARG") ;;
    l) IFS='|' read -ra labels < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
    k) IFS='|' read -ra words < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
    i) q_ci=true ;;
    T|D|R) mode=$opt; doc=$(tr -dc '[:digit:]_' <<<"$OPTARG") ;;
    p) page=$(tr -dc '[:digit:]' <<<"$OPTARG") ;;
  esac
done

# CHECK COMMAND LINE PARAMETERS
case "$mode" in
  Q)
    # Drop date prefixes shorter than 4 digits. The array is rebuilt rather
    # than punched with 'unset' so that it stays dense.
    kept=()
    for d in "${dates[@]}"; do
      (( ${#d} >= 4 )) && kept+=("$d")
    done
    dates=("${kept[@]}")
    [[ -n "${dates[*]}${labels[*]}${words[*]}" ]] || exit 2
    ;;
  T)
    [[ -n "$doc" && -d "$BASE/$doc" ]] || exit 3
    ;;
  D|R)
    [[ -n "$doc" && -d "$BASE/$doc" ]] || exit 3
    [[ -f "$BASE/$doc/doc.pdf" || -f "$BASE/$doc/paper.$page.jpg" ]] || exit 3
    ;;
  *)
    exit 1
    ;;
esac

# RUN

# Reads text on stdin and writes it to stdout as a double-quoted JSON string:
# backslash, slash and double quote are backslash-escaped, tabs become \t.
json_string() {
  printf '"%s"' "$(sed 's#[\\/"]#\\&#g;s#\t#\\t#g')"
}

# Prints "WIDTH HEIGHT" for image file $1, taken from the last ", WxH," group
# of the 'file -b' description; prints "0 0" when no such group is found.
image_size() {
  local dims
  dims=$(file -b "$1" | sed -rn 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/p')
  printf '%s\n' "${dims:-0 0}"
}

# Prints the candidate document folders, one per line, without duplicates:
# every sub-directory of the current directory when no date prefix was given,
# otherwise the directories whose name starts with one of the prefixes.
list_candidates() {
  local prefix
  if (( ${#dates[@]} == 0 )); then
    find . -mindepth 1 -maxdepth 1 -type d -printf '%P\n'
  else
    for prefix in "${dates[@]}"; do
      # An unmatched glob stays literal; find's complaint is discarded below.
      find "$prefix"* -maxdepth 0 -type d -printf '%p\n'
    done
  fi 2>/dev/null | sort -u
}

# Without this check a failed cd would run the query in the caller's directory.
cd "$BASE" || exit 3

# grep options shared by the label and keyword filters (fixed strings).
match_opts=(-F)
[[ -n "$q_ci" ]] && match_opts+=(-i)

case "$mode" in
  Q)
    mapfile -t folders < <(list_candidates)

    # Label filter (AND): keep the folders whose 'labels' file contains every
    # requested label. Arguments are passed as array elements, never through
    # eval, so a label cannot be re-interpreted by the shell.
    for label in "${labels[@]}"; do
      (( ${#folders[@]} > 0 )) || break  # grep without files would read stdin
      label_files=()
      for folder in "${folders[@]}"; do
        label_files+=("$folder/labels")
      done
      mapfile -t folders < <(
        grep -l "${match_opts[@]}" -e "$label" -- "${label_files[@]}" 2>/dev/null \
          | sed 's#/labels$##')
    done

    # Keyword filter (AND).
    if (( ${#words[@]} > 0 )); then
      word_files=()
      for folder in "${folders[@]}"; do
        for f in "$folder"/paper.*.words; do
          [[ -f "$f" ]] && word_files+=("$f")
        done
      done
      hits=
      if (( ${#word_files[@]} > 0 )); then
        # One "file:>text<" line per text fragment found between two tags,
        # with the basic XML entities decoded.
        # NOTE(review): pattern and entity list assume paper.N.words holds
        # hOCR-like markup - confirm against real Paperwork files.
        hits=$(grep -Ho -e '>[^<>]*<' -- "${word_files[@]}" 2>/dev/null \
          | sed 's#&lt;#<#g;s#&gt;#>#g;s#&amp;#\&#g')
        # NOTE(review): each keyword is matched against the whole line, so it
        # can also match the file name, and all keywords must occur in the
        # same text fragment.
        for word in "${words[@]}"; do
          hits=$(grep "${match_opts[@]}" -e "$word" <<<"$hits")
        done
      fi
      mapfile -t folders < <(cut -d: -f1 <<<"$hits" | cut -d/ -f1 | sort -u)
    fi

    # Emit the JSON array, folders in reverse order.
    printf '['
    sep=
    while IFS= read -r folder; do
      [[ -n "$folder" ]] || continue
      if [[ -f "$folder/doc.pdf" ]]; then type=pdf; else type=pages; fi
      count=0
      for thumb in "$folder"/paper.*.thumb.jpg; do
        [[ -e "$thumb" ]] && count=$((count + 1))
      done
      # The label is the text before the first comma of each line of 'labels'.
      labs=
      if [[ -f "$folder/labels" ]]; then
        while IFS=, read -r lab _; do
          labs+=",$(json_string <<<"$lab")"
        done < <(sort -df -- "$folder/labels")
      fi
      printf '%s{"folder":%s,"labels":[%s],"count":%d,"type":"%s"}' \
        "$sep" "$(json_string <<<"$folder")" "${labs#,}" "$count" "$type"
      sep=,
    done < <(printf '%s\n' "${folders[@]}" | sort -ru)
    printf ']'
    ;;
  T)
    printf '['
    sep=
    while IFS= read -r thumb; do
      read -r w h < <(image_size "$thumb")
      printf '%s{"mime":"image\/jpeg","data":%s,"width":%d,"height":%d}' \
        "$sep" "$(base64 --wrap=0 "$thumb" | json_string)" "$w" "$h"
      sep=,
    done < <(
      # Thumbnails in page order: numeric sort on the N of paper.N.thumb.jpg.
      for t in "$doc"/paper.*.thumb.jpg; do
        [[ -e "$t" ]] && printf '%s\n' "$t"
      done | sort -t. -k2,2n
    )
    printf ']'
    ;;
  D)
    # A PDF document is returned whole (size 0x0), otherwise the asked page.
    if [[ -f "$doc/doc.pdf" ]]; then
      p=$doc/doc.pdf
      w=0
      h=0
    else
      p=$doc/paper.$page.jpg
      read -r w h < <(image_size "$p")
    fi
    mime=$(file -bi "$p" | cut -d';' -f1)
    printf '{"mime":%s,"data":%s,"width":%d,"height":%d}' \
      "$(json_string <<<"$mime")" "$(base64 --wrap=0 "$p" | json_string)" "$w" "$h"
    ;;
  R)
    if [[ -f "$doc/doc.pdf" ]]; then
      cat "$doc/doc.pdf"
    else
      cat "$doc/paper.$page.jpg"
    fi
    ;;
esac