#!/bin/bash
# PAPERWEB - GPLv3 licence
# Copyright 2016 Yves Gablin
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# -h This help.
#
# Query:
# -Q [-d <pipe-separated list of YYYYMMDD/YYYYMM/YYYY prefixes to search>]
# [-l <pipe-separated list of labels to match (all must be found)>]
# [-k <pipe-separated list of keywords to match (all must be found)>]
# [-i] : case-insensitive search
# At least -d, -l, or -k must be used.
# Result (WSN):
# OUTPUT = "[" [ DOC_LIST ] "]" .
# DOC_LIST = DOC { "," DOC } .
# DOC = "{""folder"":" FOLDER ",""labels"":[" [ LABELS ] "],""count"":" COUNT ",""type"":" TYPE "}" .
# FOLDER = json_string .
# LABELS = json_string { "," json_string } .
# COUNT = json_number .
# TYPE = """pdf""" | """pages""" .
#
# Retrieve a document's thumbnails:
# -T <date> : the folder-name of the document
# Result (WSN):
# OUTPUT = "[" THUMBS "]" .
# THUMBS = CONTENTS { "," CONTENTS } .
# CONTENTS = "{""mime"":" MIME ",""data"":" B64_DATA ",""width"":" WIDTH ",""height"":" HEIGHT "}" .
# MIME = json_string .
# B64_DATA = json_string .
# WIDTH = json_number .
# HEIGHT = json_number .
#
# Retrieve a document's page/PDF and metadata:
# -D <date> -p <page number>
# Result (WSN):
# OUTPUT = CONTENTS .
# The "data" field contains the wanted page in JPEG format, except if the
# document is a PDF file: then the whole PDF file is encoded.
# Width and height should be ignored for PDF contents.
#
# Retrieve a document's page's metadata without the actual page:
# -M <date> -p <page number>
# Result (WSN):
# OUTPUT = "{""mime"":" MIME ",""width"":" WIDTH ",""height"":" HEIGHT "}" .
# Width and height should be ignored for PDF contents.
#
# Retrieve a raw document's page/PDF without metadata:
# -R <date> -p <page number>
# Result: page contents.
# The returned data is the raw page in JPEG format, except if the document is
# a PDF file: then the whole PDF file is returned.
#
# The behaviour exposed above changes if the commands "pdfinfo" and "pdftoppm"
# are both available. In this case, pages from PDF documents are treated the
# same way as pages from image-based documents.

########## CONFIGURATION ##########

# Directory that holds one sub-folder per document. The script changes into
# it before running and resolves every document folder relative to it.
BASE='/PATH/TO/PAPERWORK/BASE/DIRECTORY'
# Resolution handed to "pdftoppm -r" when a PDF page is rendered as JPEG.
PDF_DPI=90

##### NO CHANGE PAST THIS LINE #####

# INIT
# Every global is reset here so that nothing leaks in from the caller's
# environment.
mode=       # operation selected on the command line: Q, T, D, M or R
dates=()    # -d: folder-name prefixes to search
labels=()   # -l: labels that must all match
words=()    # -k: keywords that must all match
q_ci=       # "true" when -i (case-insensitive search) was given
doc=        # -T/-D/-M/-R: folder name of the requested document
page=       # -p: requested page number
pdfasjpg=   # "true" when PDF pages can be rendered as JPEG (see below)
# PDF pages are only converted to JPEG when both poppler tools are installed.
{ type pdfinfo && type pdftoppm; } &>/dev/null && pdfasjpg=true

# READ COMMAND LINE PARAMETERS
# Each value is filtered through tr before it is stored:
#   -d          keeps only digits and the "|" separator;
#   -l, -k      drop commas, double quotes and control characters;
#   -T/-D/-M/-R keep only digits and "_";
#   -p          keeps only digits.
# The pipe-separated lists are split on "|" into arrays. "read -r" keeps
# backslashes in labels and keywords literal instead of swallowing them.
while getopts hQd:l:k:iT:D:p:M:R: opt; do
  case "$opt" in
    # Print the header comment: from line 2 up to the first empty line,
    # with the leading "#" removed.
    h) sed -n '2,/^$/s/.//p' "$0"; exit 0 ;;
    Q) mode=Q ;;
    d) IFS='|' read -r -a dates < <(tr -dc '|[:digit:]' <<<"$OPTARG") ;;
    l) IFS='|' read -r -a labels < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
    k) IFS='|' read -r -a words < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
    i) q_ci=true ;;
    T|D|M|R)
      mode=$opt
      doc=$(tr -dc '[:digit:]_' <<<"$OPTARG")
      ;;
    p) page=$(tr -dc '[:digit:]' <<<"$OPTARG") ;;
  esac
done

# CHECK COMMAND LINE PARAMETERS
# Exit status on rejection:
#   1 - no (or unknown) operation selected
#   2 - query without any usable -d, -l or -k criterion
#   3 - unknown document folder, or page missing / out of range
case "$mode" in
  Q)
    # Keep only date prefixes of at least 4 digits. The array is rebuilt
    # rather than punched with "unset", so that it stays densely indexed:
    # later code that reads dates[0] must never see a hole there.
    kept_dates=()
    for d in "${dates[@]}"; do
      [ "${#d}" -ge 4 ] && kept_dates+=("$d")
    done
    dates=("${kept_dates[@]}")
    unset kept_dates d
    [ -n "${dates[*]}${labels[*]}${words[*]}" ] || exit 2
    ;;
  T)
    [ -n "$doc" ] && [ -d "$BASE/$doc" ] || exit 3
    ;;
  D|M|R)
    [ -n "$doc" ] && [ -d "$BASE/$doc" ] || exit 3
    if [ -f "$BASE/$doc/doc.pdf" ]; then
      # The page number only matters for a PDF when pages are rendered
      # individually; otherwise the whole file is returned.
      if [ -n "$pdfasjpg" ]; then
        maxp=$(pdfinfo "$BASE/$doc/doc.pdf" | awk '/^Pages:/{print $2}')
        # "[" compares in decimal, so a page such as "08" is not
        # mistaken for an invalid octal number.
        [ -n "$maxp" ] && [ -n "$page" ] \
          && [ "$page" -gt 0 ] && [ "$page" -le "$maxp" ] || exit 3
      fi
    else
      [ -f "$BASE/$doc/paper.$page.jpg" ] || exit 3
    fi
    ;;
  *) exit 1 ;;
esac

# RUN

# &0: raw string
# &1: json string
#
# Reads the whole of stdin and writes it as one double-quoted JSON string,
# without a trailing newline. Trailing newlines of the input are dropped.
# Backslash, slash and double quote get a backslash prefix; tab, carriage
# return and embedded newline become \t, \r and \n so that multi-line input
# still yields valid JSON. Other control characters are passed through
# unchanged.
json_string() {
  local escaped
  # The first five expressions slurp all input lines into the pattern space
  # so that newlines can be rewritten like any other character.
  escaped=$(sed -e ':a' -e '$!{' -e 'N' -e 'ba' -e '}' \
    -e 's#\n*$##' \
    -e 's#[\\/"]#\\&#g' \
    -e 's#\t#\\t#g' -e 's#\r#\\r#g' -e 's#\n#\\n#g')
  printf '"%s"' "$escaped"
}

# &0: image data
# &1: "<width> <height>"
#
# Extracts the pixel size from the description printed by "file". The size is
# expected as "<W>x<H>" in the next-to-last comma-separated field. When the
# description holds no such field nothing is printed, so callers read empty
# values instead of stray words from the description.
image_wh() {
  file -b - | sed -n -r 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/p'
}

# Mode Q: print the JSON list of documents matching the -d/-l/-k criteria.
# Reads the globals dates, labels, words and q_ci, and expects the current
# directory to be the document base. Output is sorted by folder name,
# newest first, without duplicates.
# No user-supplied text is ever re-parsed by the shell (no eval): labels and
# keywords reach grep only as literal -e arguments.
query_documents() {
  local -a folders=() globs=() files=() kept=() span_dir=() thumbs=()
  local -A hit=()
  local d f l w n spans texts labs lab _ kind sep suffix

  # 1. Candidate folders: everything, or the folders whose name starts with
  #    one of the date prefixes. The prefixes hold digits only, so the glob
  #    below cannot expand to anything unexpected; a prefix without a match
  #    stays literal and is silently rejected by find.
  if ((${#dates[@]} == 0)); then
    mapfile -t folders < <(
      find . -mindepth 1 -maxdepth 1 -type d -printf '%P\n' 2>/dev/null)
  else
    for d in "${dates[@]}"; do
      globs+=("$d"*)
    done
    mapfile -t folders < <(
      find "${globs[@]}" -maxdepth 0 -type d -printf '%p\n' 2>/dev/null \
        | sort -u)
  fi

  # 2. Labels (AND): keep the folders whose "labels" file contains the label
  #    as a fixed string; one grep per label over all remaining folders.
  suffix=/labels
  for l in "${labels[@]}"; do
    [[ -n $l ]] || continue
    ((${#folders[@]})) || break
    files=("${folders[@]/%/$suffix}")
    mapfile -t folders < <(
      grep -lF ${q_ci:+-i} -e "$l" -- "${files[@]}" 2>/dev/null \
        | sed 's#/labels$##')
  done

  # 3. Keywords (AND): a folder is kept when every keyword occurs in the text
  #    of at least one innermost <span> of one of its paper.*.words files.
  if ((${#words[@]} && ${#folders[@]})); then
    files=()
    for f in "${folders[@]}"; do
      files+=("$f"/paper.*.words)
    done
    # One line per span: "<folder>/paper.<n>.words:><text>"
    spans=$(grep -Ho '>[^<>]*</span' -- "${files[@]}" 2>/dev/null \
      | sed 's#</span$##')
    # Folder and text are split into parallel lists so that a keyword can
    # only match the text, never the file name in front of it.
    mapfile -t span_dir < <(sed 's#/paper\.[0-9]*\.words:>.*$##' <<<"$spans")
    texts=$(sed 's#^[^:]*:>##;s#&lt;#<#g;s#&gt;#>#g;s#&amp;#\&#g' <<<"$spans")
    for w in "${words[@]}"; do
      [[ -n $w ]] || continue
      hit=()
      # grep -n yields the numbers of the matching text lines; the same
      # position in span_dir names the folder the span belongs to.
      while IFS= read -r n; do
        hit[${span_dir[n-1]}]=1
      done < <(grep -nF ${q_ci:+-i} -e "$w" <<<"$texts" | cut -d: -f1)
      kept=()
      for f in "${folders[@]}"; do
        [[ -n ${hit[$f]} ]] && kept+=("$f")
      done
      folders=("${kept[@]}")
    done
  fi

  # 4. Output.
  printf '['
  sep=
  while IFS= read -r f; do
    [[ -n $f ]] || continue
    if [[ -f $f/doc.pdf ]]; then kind=pdf; else kind=pages; fi
    # Number of thumbnails; an unmatched glob stays literal, hence the test.
    thumbs=("$f"/paper.*.thumb.jpg)
    [[ -e ${thumbs[0]} ]] || thumbs=()
    # Label name is the text before the first comma of each "labels" line.
    labs=
    if [[ -f $f/labels ]]; then
      while IFS=, read -r lab _; do
        [[ -n $lab ]] || continue
        labs+=,$(json_string <<<"$lab")
      done < <(sort -df -- "$f/labels")
    fi
    printf '%s{"folder":%s,"labels":[%s],"count":%d,"type":"%s"}' \
      "$sep" "$(json_string <<<"$f")" "${labs#,}" "${#thumbs[@]}" "$kind"
    sep=,
  done < <(printf '%s\n' "${folders[@]}" | sort -ru)
  printf ']'
}

# Mode T: print the JSON list of the thumbnails of document $doc, ordered by
# page number, each with its base64-encoded JPEG data and pixel size.
list_thumbnails() {
  local t w h sep=
  printf '['
  while IFS= read -r t; do
    [[ -f $t ]] || continue   # unmatched glob: the document has no thumbnail
    read -r w h < <(image_wh <"$t")
    printf '%s{"mime":"image\/jpeg","data":%s,"width":%d,"height":%d}' \
      "$sep" "$(base64 --wrap=0 "$t" | json_string)" "${w:-0}" "${h:-0}"
    sep=,
  done < <(printf '%s\n' "$doc"/paper.*.thumb.jpg | sort -t. -k2,2n)
  printf ']'
}

# Modes D and M: print the metadata of page $page of document $doc as JSON;
# mode D additionally embeds the base64-encoded contents. A PDF document is
# returned whole (width and height 0) unless its pages can be rendered as
# JPEG, in which case only the requested page is rendered.
show_page() {
  local src mime w=0 h=0
  local -a cmd
  if [[ -f $doc/doc.pdf ]]; then
    src=$doc/doc.pdf
    if [[ -n $pdfasjpg ]]; then
      mime='image/jpeg'
      cmd=(pdftoppm -r "$PDF_DPI" -jpeg -f "$page" -l "$page" "$src")
      read -r w h < <("${cmd[@]}" | image_wh)
    else
      mime='application/pdf'
      cmd=(cat -- "$src")
    fi
  else
    src=$doc/paper.$page.jpg
    mime='image/jpeg'
    cmd=(cat -- "$src")
    read -r w h < <(image_wh <"$src")
  fi
  if [[ $mode == D ]]; then
    printf '{"mime":%s,"data":%s,"width":%d,"height":%d}' \
      "$(json_string <<<"$mime")" \
      "$("${cmd[@]}" | base64 --wrap=0 | json_string)" "${w:-0}" "${h:-0}"
  else
    printf '{"mime":%s,"width":%d,"height":%d}' \
      "$(json_string <<<"$mime")" "${w:-0}" "${h:-0}"
  fi
}

# Mode R: write the raw page (JPEG) or the raw PDF file to stdout.
dump_raw() {
  if [[ -f $doc/doc.pdf ]]; then
    if [[ -n $pdfasjpg ]]; then
      pdftoppm -r "$PDF_DPI" -jpeg -f "$page" -l "$page" "$doc/doc.pdf"
    else
      cat -- "$doc/doc.pdf"
    fi
  else
    cat -- "$doc/paper.$page.jpg"
  fi
}

# Enter the document base and run the selected operation. Running anywhere
# else would make the query inspect an unrelated directory, hence the exit.
run_mode() {
  cd "$BASE" || exit 3
  case "$mode" in
    Q) query_documents ;;
    T) list_thumbnails ;;
    D|M) show_page ;;
    R) dump_raw ;;
  esac
}

# Nothing to do when no operation was selected.
if [[ -n $mode ]]; then
  run_mode
fi