Better handling of metadata, individual PDF pages
parent
18866c3157
commit
d1c5cae9f8
|
@ -13,6 +13,7 @@
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
#
|
#
|
||||||
# -h This help.
|
# -h This help.
|
||||||
|
#
|
||||||
# Query:
|
# Query:
|
||||||
# -Q [-d <pipe-separated list of YYYYMMDD/YYYYMM/YYYY prefixes to search>]
|
# -Q [-d <pipe-separated list of YYYYMMDD/YYYYMM/YYYY prefixes to search>]
|
||||||
# [-l <pipe-separated list of labels to match (all must be found)>]
|
# [-l <pipe-separated list of labels to match (all must be found)>]
|
||||||
|
@ -27,6 +28,7 @@
|
||||||
# LABELS = json_string { "," json_string } .
|
# LABELS = json_string { "," json_string } .
|
||||||
# COUNT = json_number .
|
# COUNT = json_number .
|
||||||
# TYPE = """pdf""" | """pages""" .
|
# TYPE = """pdf""" | """pages""" .
|
||||||
|
#
|
||||||
# Retrieve a document's thumbnails:
|
# Retrieve a document's thumbnails:
|
||||||
# -T <date> : the folder-name of the document
|
# -T <date> : the folder-name of the document
|
||||||
# Result (WSN):
|
# Result (WSN):
|
||||||
|
@ -37,18 +39,35 @@
|
||||||
# B64_DATA = json_string .
|
# B64_DATA = json_string .
|
||||||
# WIDTH = json_number .
|
# WIDTH = json_number .
|
||||||
# HEIGHT = json_number .
|
# HEIGHT = json_number .
|
||||||
|
#
|
||||||
# Retrieve a document's page/PDF and metadata:
|
# Retrieve a document's page/PDF and metadata:
|
||||||
# -D <date> -p <page number>
|
# -D <date> -p <page number>
|
||||||
# Result (WSN):
|
# Result (WSN):
|
||||||
# OUTPUT = CONTENTS .
|
# OUTPUT = CONTENTS .
|
||||||
|
# The "data" field contains the wanted page in JPEG format, except if the
|
||||||
|
# document is a PDF file: then the whole PDF file is encoded.
|
||||||
# Width and height should be ignored for PDF contents.
|
# Width and height should be ignored for PDF contents.
|
||||||
|
#
|
||||||
|
# Retrieve a document's page's metadata without the actual page:
|
||||||
|
# -M <date> -p <page number>
|
||||||
|
# Result (WSN):
|
||||||
|
# OUTPUT = "{""mime"":" MIME ",""width"":" WIDTH ",""height"":" HEIGHT "}" .
|
||||||
|
# Width and height should be ignored for PDF contents.
|
||||||
|
#
|
||||||
# Retrieve a raw document's page/PDF without metadata:
|
# Retrieve a raw document's page/PDF without metadata:
|
||||||
# -R <date> -p <page number>
|
# -R <date> -p <page number>
|
||||||
# Result: file contents.
|
# Result: page contents.
|
||||||
|
# The returned data is the raw page in JPEG format, except if the document is
|
||||||
|
# a PDF file: then the whole PDF file is returned.
|
||||||
|
#
|
||||||
|
# The behaviour described above changes if the commands "pdfinfo" and "pdftoppm"
|
||||||
|
# are both available. In this case, pages from PDF documents are treated the
|
||||||
|
# same way as pages from image-based documents.
|
||||||
|
|
||||||
########## CONFIGURATION ##########
|
########## CONFIGURATION ##########
|
||||||
|
|
||||||
BASE='/PATH/TO/PAPERWORK/BASE/DIRECTORY'
|
BASE='/PATH/TO/PAPERWORK/BASE/DIRECTORY'
|
||||||
|
PDF_DPI=90
|
||||||
|
|
||||||
##### NO CHANGE PAST THIS LINE #####
|
##### NO CHANGE PAST THIS LINE #####
|
||||||
|
|
||||||
|
@ -60,16 +79,17 @@ words=()
|
||||||
q_ci=
|
q_ci=
|
||||||
doc=
|
doc=
|
||||||
page=
|
page=
|
||||||
|
{ type pdfinfo && type pdftoppm; } &>/dev/null && pdfasjpg=true
|
||||||
|
|
||||||
# READ COMMAND LINE PARAMETERS
|
# READ COMMAND LINE PARAMETERS
|
||||||
while getopts hQd:l:k:iT:D:p:R: opt; do case "$opt" in
|
while getopts hQd:l:k:iT:D:p:M:R: opt; do case "$opt" in
|
||||||
h) sed -n '2,/^$/s/.//p' "$0"; exit 0 ;;
|
h) sed -n '2,/^$/s/.//p' "$0"; exit 0 ;;
|
||||||
Q) mode=Q ;;
|
Q) mode=Q ;;
|
||||||
d) IFS='|' read -a dates < <(tr -dc '|[:digit:]' <<<"$OPTARG") ;;
|
d) IFS='|' read -a dates < <(tr -dc '|[:digit:]' <<<"$OPTARG") ;;
|
||||||
l) IFS='|' read -a labels < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
|
l) IFS='|' read -a labels < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
|
||||||
k) IFS='|' read -a words < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
|
k) IFS='|' read -a words < <(tr -d ',"[:cntrl:]' <<<"$OPTARG") ;;
|
||||||
i) q_ci=true ;;
|
i) q_ci=true ;;
|
||||||
T|D|R)
|
T|D|M|R)
|
||||||
mode=$opt; doc=$(tr -dc '[:digit:]_' <<<"$OPTARG") ;;
|
mode=$opt; doc=$(tr -dc '[:digit:]_' <<<"$OPTARG") ;;
|
||||||
p) page=$(tr -dc '[:digit:]' <<<"$OPTARG") ;;
|
p) page=$(tr -dc '[:digit:]' <<<"$OPTARG") ;;
|
||||||
esac; done
|
esac; done
|
||||||
|
@ -79,17 +99,33 @@ case "$mode" in
|
||||||
Q) for ((i=${#dates[*]}-1;i>=0;i--)); do [ ${#dates[i]} -ge 4 ] || unset dates[i]; done
|
Q) for ((i=${#dates[*]}-1;i>=0;i--)); do [ ${#dates[i]} -ge 4 ] || unset dates[i]; done
|
||||||
[ -n "${dates[*]}${labels[*]}${words[*]}" ] || exit 2 ;;
|
[ -n "${dates[*]}${labels[*]}${words[*]}" ] || exit 2 ;;
|
||||||
T) [ -n "$doc" -a -d "$BASE/$doc" ] || exit 3 ;;
|
T) [ -n "$doc" -a -d "$BASE/$doc" ] || exit 3 ;;
|
||||||
D|R)
|
D|M|R)
|
||||||
[ -n "$doc" -a -d "$BASE/$doc" ] || exit 3
|
[ -n "$doc" -a -d "$BASE/$doc" ] || exit 3
|
||||||
[ -f "$BASE/$doc/doc.pdf" -o -f "$BASE/$doc/paper.$page.jpg" ] || exit 3 ;;
|
if [ -f "$BASE/$doc/doc.pdf" ]; then
|
||||||
|
if [ -n "$pdfasjpg" ]; then
|
||||||
|
maxp=$(pdfinfo "$BASE/$doc/doc.pdf" | awk '/^Pages:/{print $2}')
|
||||||
|
[ -n "$maxp" -a -n "$page" -a $page -gt 0 -a $page -le $maxp ] || exit 3
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
[ -f "$BASE/$doc/paper.$page.jpg" ] || exit 3
|
||||||
|
fi ;;
|
||||||
*) exit 1 ;;
|
*) exit 1 ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
# RUN
|
# RUN
|
||||||
|
|
||||||
|
# &0 (stdin): raw string
|
||||||
|
# &1 (stdout): the string escaped and double-quoted as a JSON string
|
||||||
function json_string() {
|
function json_string() {
|
||||||
printf '"%s"' "$(sed 's#[\\/"]#\\&#g;s#\t#\\t#g')"
|
printf '"%s"' "$(sed 's#[\\/"]#\\&#g;s#\t#\\t#g')"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# &0 (stdin): image data
|
||||||
|
# &1 (stdout): "<width> <height>" as reported by file(1)
|
||||||
|
function image_wh() {
|
||||||
|
file -b - | sed -r 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/'
|
||||||
|
}
|
||||||
|
|
||||||
cd "$BASE"
|
cd "$BASE"
|
||||||
case "$mode" in
|
case "$mode" in
|
||||||
Q)
|
Q)
|
||||||
|
@ -144,27 +180,43 @@ T)
|
||||||
printf '['
|
printf '['
|
||||||
/bin/ls -1 $doc/paper.*.thumb.jpg | sort -t. -k2,2n \
|
/bin/ls -1 $doc/paper.*.thumb.jpg | sort -t. -k2,2n \
|
||||||
| while read t; do
|
| while read t; do
|
||||||
read w h < <(file -b "$t" | sed -r 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/')
|
read w h < <(image_wh <$t)
|
||||||
printf ',{"mime":"image\/jpeg","data":%s,"width":%d,"height":%d}' \
|
printf ',{"mime":"image\/jpeg","data":%s,"width":%d,"height":%d}' \
|
||||||
"$(base64 --wrap=0 "$t" | json_string)" $w $h
|
"$(base64 --wrap=0 "$t" | json_string)" $w $h
|
||||||
done | sed 's/^.//'
|
done | sed 's/^.//'
|
||||||
printf ']'
|
printf ']'
|
||||||
;;
|
;;
|
||||||
D)
|
D|M)
|
||||||
if [ -f $doc/doc.pdf ]; then
|
if [ -f $doc/doc.pdf ]; then
|
||||||
p=$doc/doc.pdf
|
p=$doc/doc.pdf
|
||||||
w=0; h=0
|
if [ -n "$pdfasjpg" ]; then
|
||||||
|
read w h < <(pdftoppm -r $PDF_DPI -jpeg -f $page -l $page $p | image_wh)
|
||||||
|
mime='image/jpeg'
|
||||||
|
cmd=(pdftoppm -r $PDF_DPI -jpeg -f $page -l $page $p)
|
||||||
|
else
|
||||||
|
w=0; h=0; mime='application/pdf'
|
||||||
|
cmd=(cat $p)
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
p=$doc/paper.$page.jpg
|
p=$doc/paper.$page.jpg
|
||||||
read w h < <(file -b "$p" | sed -r 's/.*, ([0-9]+)x([0-9]+),[^,]*$/\1 \2/')
|
read w h < <(image_wh <$p)
|
||||||
|
mime='image/jpeg'
|
||||||
|
cmd=(cat $p)
|
||||||
|
fi
|
||||||
|
if [ $mode == D ]; then
|
||||||
|
printf '{"mime":%s,"data":%s,"width":%d,"height":%d}' \
|
||||||
|
"$(json_string <<<"$mime")" "$("${cmd[@]}" | base64 --wrap=0 | json_string)" $w $h
|
||||||
|
else
|
||||||
|
printf '{"mime":%s,"width":%d,"height":%d}' "$(json_string <<<"$mime")" $w $h
|
||||||
fi
|
fi
|
||||||
mime=$(file -bi $p | cut -d';' -f1)
|
|
||||||
printf '{"mime":%s,"data":%s,"width":%d,"height":%d}' \
|
|
||||||
"$(json_string <<<"$mime")" "$(base64 --wrap=0 "$p" | json_string)" $w $h
|
|
||||||
;;
|
;;
|
||||||
R)
|
R)
|
||||||
if [ -f $doc/doc.pdf ]; then
|
if [ -f $doc/doc.pdf ]; then
|
||||||
cat $doc/doc.pdf
|
if [ -n "$pdfasjpg" ]; then
|
||||||
|
pdftoppm -r $PDF_DPI -jpeg -f $page -l $page $doc/doc.pdf
|
||||||
|
else
|
||||||
|
cat $doc/doc.pdf
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
cat $doc/paper.$page.jpg
|
cat $doc/paper.$page.jpg
|
||||||
fi
|
fi
|
||||||
|
|
|
@ -20,11 +20,16 @@ $USER='USER THAT SUDO WILL RUN paperfind.sh AS';
|
||||||
##### NO CHANGE PAST THIS LINE #####
|
##### NO CHANGE PAST THIS LINE #####
|
||||||
|
|
||||||
if (array_key_exists('doDownload', $_REQUEST)) {
|
if (array_key_exists('doDownload', $_REQUEST)) {
|
||||||
$mime = (@$_REQUEST['type'] == 'pdf' ? 'application/pdf' : 'image/jpeg');
|
|
||||||
$date = escapeshellarg(@$_REQUEST['date']);
|
$date = escapeshellarg(@$_REQUEST['date']);
|
||||||
$page = escapeshellarg(@$_REQUEST['page']);
|
$page = escapeshellarg(@$_REQUEST['page']);
|
||||||
header("Content-Type: {$mime}");
|
|
||||||
passthru("sudo -u {$USER} {$PATH} -R {$date} -p {$page}");
|
# -M and -R are used instead of -D to avoid storing the data in RAM
|
||||||
|
$json = exec("sudo -u {$USER} {$PATH} -M {$date} -p {$page}");
|
||||||
|
if ($json) {
|
||||||
|
$meta = json_decode($json, true);
|
||||||
|
header("Content-Type: {$meta['mime']}");
|
||||||
|
passthru("sudo -u {$USER} {$PATH} -R {$date} -p {$page}");
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
?>
|
?>
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
|
@ -73,10 +78,9 @@ if (array_key_exists('doDownload', $_REQUEST)) {
|
||||||
<input type="hidden" name="queryDone" value="<?php echo htmlentities($json); ?>"/>
|
<input type="hidden" name="queryDone" value="<?php echo htmlentities($json); ?>"/>
|
||||||
<?php
|
<?php
|
||||||
foreach (json_decode($json, true) as $doc) {
|
foreach (json_decode($json, true) as $doc) {
|
||||||
$docId = $doc['folder'].'|'.$doc['type'];
|
|
||||||
?>
|
?>
|
||||||
<button type="submit" name="doThumbnails" value="<?php echo htmlentities($docId); ?>"<?php
|
<button type="submit" name="doThumbnails" value="<?php echo htmlentities($doc['folder']); ?>"<?php
|
||||||
if ($docId == $current) {
|
if ($doc['folder'] == $current) {
|
||||||
echo ' disabled="disabled"';
|
echo ' disabled="disabled"';
|
||||||
}
|
}
|
||||||
?>>
|
?>>
|
||||||
|
@ -99,27 +103,23 @@ if (array_key_exists('doDownload', $_REQUEST)) {
|
||||||
<section id="thumbs">
|
<section id="thumbs">
|
||||||
<?php
|
<?php
|
||||||
if (array_key_exists('doThumbnails', $_REQUEST)) {
|
if (array_key_exists('doThumbnails', $_REQUEST)) {
|
||||||
$do = explode('|', $_REQUEST['doThumbnails']);
|
$date = $_REQUEST['doThumbnails'];
|
||||||
$date = $do[0];
|
|
||||||
$type = $do[1];
|
|
||||||
$datearg = escapeshellarg($date);
|
$datearg = escapeshellarg($date);
|
||||||
$json = exec("sudo -u {$USER} {$PATH} -T {$datearg}");
|
$json = exec("sudo -u {$USER} {$PATH} -T {$datearg}");
|
||||||
} else {
|
} else {
|
||||||
$json = @$_REQUEST['thumbnailsDone'];
|
$json = @$_REQUEST['thumbnailsDone'];
|
||||||
$do = explode('|', @$_REQUEST['currentDoc']);
|
$date = @$_REQUEST['currentDoc'];
|
||||||
$date = @$do[0];
|
|
||||||
$type = @$do[1];
|
|
||||||
}
|
}
|
||||||
if ($json) {
|
if ($json) {
|
||||||
?>
|
?>
|
||||||
<h2>Pages</h2>
|
<h2>Pages</h2>
|
||||||
<input type="hidden" name="thumbnailsDone" value="<?php echo htmlentities($json); ?>"/>
|
<input type="hidden" name="thumbnailsDone" value="<?php echo htmlentities($json); ?>"/>
|
||||||
<input type="hidden" name="currentDoc" value="<?php echo htmlentities($date.'|'.$type); ?>"/>
|
<input type="hidden" name="currentDoc" value="<?php echo htmlentities($date); ?>"/>
|
||||||
<?php
|
<?php
|
||||||
foreach (json_decode($json, true) as $n => $p) {
|
foreach (json_decode($json, true) as $n => $p) {
|
||||||
$nump = $n+1;
|
$nump = $n+1;
|
||||||
?>
|
?>
|
||||||
<a target="_blank" href="?<?php echo htmlentities("doDownload=1&type={$type}&date={$date}&page={$nump}"); ?>"><img
|
<a target="_blank" href="?<?php echo htmlentities("doDownload=1&date={$date}&page={$nump}"); ?>"><img
|
||||||
src="data:<?php echo $p['mime']; ?>;base64,<?php echo $p['data']; ?>"
|
src="data:<?php echo $p['mime']; ?>;base64,<?php echo $p['data']; ?>"
|
||||||
width="<?php echo $p['width']; ?>" height="<?php echo $p['height']; ?>"
|
width="<?php echo $p['width']; ?>" height="<?php echo $p['height']; ?>"
|
||||||
alt="Page <?php echo $nump; ?>"/></a>
|
alt="Page <?php echo $nump; ?>"/></a>
|
||||||
|
|
Loading…
Reference in New Issue