Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Paperingest2 #302

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions folderupgrade
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/bin/bash
# folderupgrade - flatten the legacy layout of one or more package directories.
# Usage: folderupgrade PACKAGE_DIR [PACKAGE_DIR ...]
#
# For every package directory given, this script
#   1. runs removeDSStore on the package,
#   2. moves the contents of metadata/submissionDocumentation/ up into
#      metadata/ and removes the emptied folder,
#   3. for each directory found under objects/Preservation/, moves its contents
#      up into objects/, moves objects/Image* to
#      metadata/depictions/object_photos, and folds objects/Restoration and
#      objects/Access into objects/restoration,
#   4. compares the md5 of every regular file in objects/ with the value stored
#      in metadata/<file>.md5.
#
# The script never changes its working directory, so relative package paths
# keep resolving and a failed cd can no longer make a wildcard mv act on the
# wrong directory.

#calling mmfunctions
SCRIPTDIR="$(dirname "$(which "${0}")")"
. "${SCRIPTDIR}/mmfunctions" || { echo "Missing '${SCRIPTDIR}/mmfunctions'. Exiting." ; exit 1 ;};

# Move every entry of a directory (hidden ones included) into another one.
# Arguments: $1 - source directory
#            $2 - destination directory
#            $3.. - options handed to mv (for example -v -n)
_folderupgrade_move_contents(){
    local src="${1%/}"
    local dest="${2}"
    shift 2
    local item
    for item in "${src}"/* "${src}"/.[^.]* ; do
        # an unmatched glob stays literal; skip it rather than let mv fail on it
        [[ -e "${item}" || -L "${item}" ]] || continue
        mv "${@}" "${item}" "${dest}"
    done
}

while [ "${*}" != "" ] ; do
    echo "removing the submissionsDocumentation folder"
    INPUTFILE="${1}"
    echo "The input is ${INPUTFILE}"
    SUBDOC="${INPUTFILE}/metadata/submissionDocumentation/"
    METADOC="${INPUTFILE}/metadata/"
    OBJECTDOC="${INPUTFILE}/objects/Preservation/"
    shift
    "${SCRIPTDIR}/removeDSStore" "${INPUTFILE}"

    #remove unnecessary directory
    if [ -d "${SUBDOC}" ] ; then
        mv -v -n "${SUBDOC}"* "${METADOC}"
        echo "going to delete for realsies"
        # rmdir only succeeds when everything was moved out
        rmdir "${SUBDOC}"
    fi

    for entry in "${OBJECTDOC}"* ; do
        echo "here is file $entry"
        if [ -d "$entry" ] ; then
            _folderupgrade_move_contents "$entry" "${INPUTFILE}/objects/" -v -n

            echo "deleting unnecessary folders"
            rmdir "$entry"

            #moving images folder to metadata and renaming
            mkdir -p "${METADOC}depictions/"
            mv -n "${INPUTFILE}/objects/Image"* "${METADOC}depictions/object_photos"

            #removing extra Image folder
            # (present when object_photos already existed and Image was moved into it)
            if [ -d "${METADOC}depictions/object_photos/Image" ] ; then
                _folderupgrade_move_contents "${METADOC}depictions/object_photos/Image" "${METADOC}depictions/object_photos" -v -n
                echo "deleting extra Image folder"
                rmdir "${METADOC}depictions/object_photos/Image"
            fi

            #removing Preservation folder
            mv -n "${INPUTFILE}/objects/Preservation Master/"* "${INPUTFILE}/objects/"
            echo "removing empty preservation folder"
            rmdir "${INPUTFILE}/objects/Preservation Master"
            rmdir "${OBJECTDOC}"

            #renaming restoration/access folder
            if [ -d "${INPUTFILE}/objects/Restoration" ] ; then
                mv -v "${INPUTFILE}/objects/Restoration" "${INPUTFILE}/objects/restoration"
            fi

            #renaming access folder
            if [ -d "${INPUTFILE}/objects/Access/" ] ; then
                echo "moving to restoration folder"
                mkdir -p "${INPUTFILE}/objects/restoration/"
                echo "created restoration folder"
                object="${INPUTFILE}/objects/Access/"
                echo "here is $object"
                _folderupgrade_move_contents "$object" "${INPUTFILE}/objects/restoration/" -n
                rmdir "${INPUTFILE}/objects/Access"
            fi
        fi
    done

    #checksum verification
    # Done per package and per file; previously only the last file of the last
    # package was compared, and two empty values counted as a match.
    # NOTE(review): assumes metadata/<file>.md5 holds exactly what `md5 -q`
    # prints - confirm against whatever writes those files.
    for file in "${INPUTFILE}/objects/"* ; do
        if [[ -f "$file" ]] ; then
            NAME="$(basename "$file")"
            echo "file is ${NAME}.md5"
            NEWMD5="$(md5 -q "$file")"
            OLDMD5="$(cat "${METADOC}${NAME}.md5")"
            if [[ -n "${NEWMD5}" && "${NEWMD5}" = "${OLDMD5}" ]] ; then
                echo "checksums match"
            else
                echo "checksum mismatch for ${NAME}" >&2
            fi
        fi
    done
done
33 changes: 22 additions & 11 deletions makepdf
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ while [ "${*}" != "" ] ; do
OUTPUTDIR="${OUTPUTDIR_FORCED}"
LOGDIR="${OUTPUTDIR}/logs"
fi
INGESTLOG="${LOGDIR}/capture.log"
OUTPUTDIRTEXT="${INPUT}/objects/access/txt_1"
_run mkdir -p "${LOGDIR}"
exec > >(tee "${LOGDIR}/$(basename "${0}")_$(_get_iso8601_c)_$(basename "${0}")_${VERSION}.txt")
Expand All @@ -75,7 +76,7 @@ while [ "${*}" != "" ] ; do
TMP_JPG_DIR="${TMP_MAKEPDF_DIR}/jpgs"
_run mkdir -p "${TMP_MAKEPDF_DIR}" "${TMP_JPG_DIR}" "${OUTPUTDIRTEXT}"

for TIF in $(find "${SOURCEDIR}" -maxdepth 1 -mindepth 1 -iname "*.tif" -type f | sort) ; do
for TIF in $(find "${SOURCEDIR}" -maxdepth 1 -mindepth 1 \( -iname "*.tif" -o -iname "*.tiff" \) -type f | sort) ; do
tifname="$(basename "${TIF}")"
_report -dt "Working on ${tifname}..."
pageno="$(echo "${tifname}" | cut -d_ -f2 | cut -d. -f1)"
Expand All @@ -95,20 +96,30 @@ while [ "${*}" != "" ] ; do
if [[ ! -s "${JPG_NAME}" ]] ; then
ffmpeg -hide_banner -nostdin -i "${TIF}" -pix_fmt yuvj420p -s 1275x1650 "${JPG_NAME}"
fi
TESSERACT_CONFIG=(-c tessedit_char_whitelist="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^\&*(){}[]\|\"':;?/>.<,~\` " -c textord_min_linesize=2.25 -c preserve_interword_spaces=1)

if [[ -f "$INGESTLOG" ]] ; then
DOCTYPE=$(_readingestlog "doctype")
fi

if [[ "${DOCTYPE}" == "t" ]] ; then
TESSERACT_CONFIG=(-c tessedit_char_whitelist="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^\&*(){}[]\|\"':;?/>.<,~\` " -c textord_min_linesize=2.25 -c preserve_interword_spaces=1)
_report -dt "ATTENTION: Character whitelist will be used for reading."
elif [[ "${DOCTYPE}" == "c" ]] ; then
TESSERACT_CONFIG=(-c textord_min_linesize=2.25 -c preserve_interword_spaces=1)
fi
tesseract "${JPG_NAME}" "${TMP_JPG_DIR}/${TIF_BASE_NAME}" -l eng --psm 4 "${TESSERACT_CONFIG[@]}" pdf
tesseract "${JPG_NAME}" "${TMP_JPG_DIR}/${TIF_BASE_NAME}" -l eng --psm 4 "${TESSERACT_CONFIG[@]}" txt
done

_report -dt "Checking for PBCore data"
SCRIPT_TITLE=$(fmpbcore "${MEDIAID}" | xmlstarlet 'select' -N "p=http://www.pbcore.org/PBCore/PBCoreNamespace.html" -t -v "/p:pbcoreCollection/p:pbcoreDescriptionDocument/p:pbcoreTitle[@titleType='Series']" -o ": " -v "/p:pbcoreCollection/p:pbcoreDescriptionDocument/p:pbcoreTitle[@titleType='Episode']")
if [[ -n "${SCRIPT_TITLE}" ]] ; then
MIDDLE_OPTIONS+=(--pdftitle "${SCRIPT_TITLE}")
fi
SCRIPT_AUTHOR=$(fmpbcore "${MEDIAID}" | xmlstarlet 'select' -N "p=http://www.pbcore.org/PBCore/PBCoreNamespace.html" -t -m "/p:pbcoreCollection/p:pbcoreDescriptionDocument/p:pbcoreCreator" -v "p:creatorRole" -o ": " -v "p:creator" -o " ; ")
if [[ -n "${SCRIPT_AUTHOR}" ]] ; then
MIDDLE_OPTIONS+=(--pdfauthor "${SCRIPT_AUTHOR}")
fi
#_report -dt "Checking for PBCore data"
#SCRIPT_TITLE=$(fmpbcore "${MEDIAID}" | xmlstarlet 'select' -N "p=http://www.pbcore.org/PBCore/PBCoreNamespace.html" -t -v "/p:pbcoreCollection/p:pbcoreDescriptionDocument/p:pbcoreTitle[@titleType='Series']" -o ": " -v "/p:pbcoreCollection/p:pbcoreDescriptionDocument/p:pbcoreTitle[@titleType='Episode']")
#if [[ -n "${SCRIPT_TITLE}" ]] ; then
# MIDDLE_OPTIONS+=(--pdftitle "${SCRIPT_TITLE}")
#fi
#SCRIPT_AUTHOR=$(fmpbcore "${MEDIAID}" | xmlstarlet 'select' -N "p=http://www.pbcore.org/PBCore/PBCoreNamespace.html" -t -m "/p:pbcoreCollection/p:pbcoreDescriptionDocument/p:pbcoreCreator" -v "p:creatorRole" -o ": " -v "p:creator" -o " ; ")
#if [[ -n "${SCRIPT_AUTHOR}" ]] ; then
# MIDDLE_OPTIONS+=(--pdfauthor "${SCRIPT_AUTHOR}")
#fi

pdfjoin "${MIDDLE_OPTIONS[@]}" --pdfkeywords "${MEDIAID}" --fitpaper 'false' --rotateoversize 'false' --paper letter $(find "${TMP_JPG_DIR}" -name "*.pdf" | sort | xargs) --outfile "${OUTPUT}"

Expand Down
183 changes: 174 additions & 9 deletions paperingest
Original file line number Diff line number Diff line change
@@ -1,21 +1,76 @@
#!/bin/bash
REQUIRECONFIG="Y"

SCRIPTDIR=$(dirname "${0}")
SCRIPTDIR="$(dirname "${0}")"
. "${SCRIPTDIR}/mmfunctions" || { echo "Missing '${SCRIPTDIR}/mmfunctions'. Exiting." ; exit 1 ;};

# Handler installed by the script's trap for SIGHUP/SIGINT/SIGTERM:
# hands the abort message to _log -a, then ends the script with status 1.
_cleanup(){
    _log -a "Process aborted" ; exit 1
}

# Ask which scanner source to use until the operator types something.
# Globals: SCANTYPE (read/written) - left untouched when already non-empty.
# Exits:   0 when the operator answers 'q';
#          1 when stdin is closed before any answer arrives.
# The value itself is not validated here; that is left to the caller.
_ask_scantype(){
    # A loop (not self-recursion) plus a check on read's status, so a closed
    # stdin ends the script instead of prompting forever.
    while [[ -z "${SCANTYPE}" ]] ; do
        _report -qn "Enter 'a' to use ADF or 'f' to use flatbed or 'q' to quit: "
        # read fails at EOF but may still deliver a final unterminated line
        if ! read -r -e SCANTYPE && [[ -z "${SCANTYPE}" ]] ; then
            _report -w "No input available. Exiting."
            exit 1
        fi
        [[ "${SCANTYPE}" = "q" ]] && exit 0
    done
    return 0
}

# Ask whether one or both sides of the sheets are scanned, until the operator
# types something.
# Globals: DOUBLESIDE (read/written) - left untouched when already non-empty.
# Exits:   0 when the operator answers 'q';
#          1 when stdin is closed before any answer arrives.
# The value itself is not validated here; that is left to the caller.
_ask_doubleside(){
    # A loop (not self-recursion) plus a check on read's status, so a closed
    # stdin ends the script instead of prompting forever.
    while [[ -z "${DOUBLESIDE}" ]] ; do
        _report -qn "Enter 1 if scanning only front or 2 if scanning both front and back or 'q' to quit: "
        # read fails at EOF but may still deliver a final unterminated line
        if ! read -r -e DOUBLESIDE && [[ -z "${DOUBLESIDE}" ]] ; then
            _report -w "No input available. Exiting."
            exit 1
        fi
        [[ "${DOUBLESIDE}" = "q" ]] && exit 0
    done
    return 0
}

# Ask how the document was produced and translate the one-letter answer.
# Globals: DOCTYPE_ANSWER (read/written) - nothing happens when it is already
#                          non-empty on entry;
#          DOCTYPE (written) - "typewriter", "computer", "handwritten" or "other".
# Exits:   0 when the operator answers 'q' (accepted although the prompt does
#          not advertise it);
#          1 when stdin is closed before any answer arrives.
# An unrecognised answer is reported and asked for again, so DOCTYPE is never
# left stale or unset after a prompt.
# NOTE(review): makepdf in this change set compares the logged doctype with
# 't' and 'c', while the long names are what gets stored here - confirm which
# form the capture log is meant to carry.
_ask_doctype(){
    while [[ -z "${DOCTYPE_ANSWER}" ]] ; do
        _report -qn "If document is from typewriter, enter 't', if from computer, enter 'c', if handwritten, enter 'h', if other, enter 'o': "
        # read fails at EOF but may still deliver a final unterminated line
        if ! read -r -e DOCTYPE_ANSWER && [[ -z "${DOCTYPE_ANSWER}" ]] ; then
            _report -w "No input available. Exiting."
            exit 1
        fi
        case "${DOCTYPE_ANSWER}" in
            "") ;;  # nothing typed: prompt again
            q) exit 0 ;;
            t) DOCTYPE="typewriter" ;;
            c) DOCTYPE="computer" ;;
            h) DOCTYPE="handwritten" ;;
            o) DOCTYPE="other" ;;
            *)
                _report -w "You said ${DOCTYPE_ANSWER} which is not valid."
                # clear the answer so the loop asks again
                DOCTYPE_ANSWER=""
                ;;
        esac
    done
    return 0
}
trap _cleanup SIGHUP SIGINT SIGTERM
_log -b

_ask_operator

_ask_mediaid

_ask_scantype
_ask_doctype

MIDDLE_OPTIONS+=(--rgb --bits 8 --resolution 600 --auto-length --paper-width 10200 --paper-height 13200 --rotate-n-n --left 0 --width 10200 --top 0 --height 13200 --double-feed n --tiff --no-jpeg --images-per-file 1 --compress zlib)

_scan_page(){
echo "Running: fscanx ${MIDDLE_OPTIONS[@]} ${ORIGDIR}/${MEDIAID}_${COUNTER}.tif" >> "${LOGDIR}/fscanx_process.txt"
fscanx "${MIDDLE_OPTIONS[@]}" "${ORIGDIR}/${MEDIAID}_${COUNTER}.tif" | tee -a "${LOGDIR}/fscanx_process.txt"
=======
COMMAND_OPTIONS+=(--rgb --bits 8 --resolution 600 --auto-length --paper-width 10200 --paper-height 13200 --rotate-n-n --left 0 --width 10200 --top 0 --height 13200 --double-feed n --tiff --no-jpeg --images-per-file 1 --compress zlib)

_scan_page(){
echo "Running: fscanx ${MIDDLE_OPTIONS[@]} ${COMMAND_OPTIONS[@]} ${ORIGDIR}/${MEDIAID}_scan${COUNTER}_${SCANTYPE_ANSWER}_.tif" >> "${LOGDIR}/fscanx_process.txt"
fscanx "${MIDDLE_OPTIONS[@]}" "${COMMAND_OPTIONS[@]}" "${ORIGDIR}/${MEDIAID}_scan${COUNTER}_${SCANTYPE_ANSWER}_.tif" | tee -a "${LOGDIR}/fscanx_process.txt"
>>>>>>> Stashed changes

}
if [ -d "${OUTDIR_PAPER}/${MEDIAID}" ] ; then
_report -wdt "It looks like this ${MEDIAID} was already scanned. If you want to overwrite the existing one please delete ${MEDIAID} first and then try again."
exit
Expand All @@ -29,28 +84,138 @@ LOGDIR="${OUTDIR_PAPER}/${MEDIAID}/metadata/logs"
mkdir -p "${ORIGDIR}"
mkdir -p "${LOGDIR}"

# Rename the flatbed scans in ORIGDIR whose names end in "1.tif".
# Globals: ORIGDIR (read), COUNTER (read by the second loop).
_file_rename_flatbed(){
# First pass: every occurrence of "1.tif" in the path becomes ".tiff".
for file in ${ORIGDIR}/*1.tif ; do mv -v -n "${file}" "${file//1.tif/.tiff}" ; done
# Second pass over the same glob: "1.tif" becomes "<COUNTER>.tiff".
# NOTE(review): once the first pass has renamed the files this glob normally
# matches nothing, and without nullglob the literal pattern is handed to mv.
# The two loops look like the old and the new form of one statement - confirm
# which one is meant to stay.
for file in ${ORIGDIR}/*1.tif ; do mv -v -n "${file}" "${file//1.tif/${COUNTER}.tiff}" ; done

}

# Give every file in ORIGDIR that ends in ".tif" the extension ".tiff".
# Globals: ORIGDIR (read)
# Existing targets are never overwritten (mv -n).
_file_rename_adf(){
    local file
    # ORIGDIR is quoted so a path containing spaces is not word-split
    for file in "${ORIGDIR}"/*.tif ; do
        # an unmatched glob stays literal; skip it rather than let mv fail on it
        [[ -e "${file}" ]] || continue
        # strip only the trailing extension; a ".tif" elsewhere in the path
        # (for instance in a directory name) must stay as it is
        mv -v -n "${file}" "${file%.tif}.tiff"
    done
}
START=$(date -u "+%Y%m%dT%H%M%SZ")

COMMAND="fscanx --adf --rgb --bits 8 --resolution 600 --paper-width 10200 --paper-height 13200 --rotate-n-n --left 0 --width 10200 --top 0 --height 13200 --double-feed n --tiff --no-jpeg --images-per-file 1 --compress zlib '${ORIGDIR}/${MEDIAID}_.tif'"
exec &> "${LOGDIR}/fscanx_process.txt"
eval "${COMMAND}"
if [[ "${SCANTYPE}" == "a" ]] ; then
MIDDLE_OPTIONS+=(--adf)
SCANTYPE_ANSWER="ADF"
_scan_page_adf() {
MIDDLE_OPTIONS=(--adf)
SCANTYPE_ANSWER="ADF"
_ask_doctype
_ask_doubleside
if [[ "${DOUBLESIDE}" == 2 ]] ; then
MIDDLE_OPTIONS+=(--duplex)
elif [[ "${DOUBLESIDE}" == 1 ]] ; then
break
:
>>>>>>> Stashed changes
else
_report -w "You said ${DOUBLESIDE} for the number of pages which is not valid, use 1 or 2."
fi
_scan_page
_file_rename_adf
<<<<<<< Updated upstream
elif [[ "${SCANTYPE}" == "f" ]] ; then
MIDDLE_OPTIONS+=(--flatbed)
SCANTYPE_ANSWER="flatbed"
COUNTER=1
_report -d -n "Hit enter to scan a page or q to stop scanning pages: "
read PAGE_ANSWER

while [[ ! "${PAGE_ANSWER}" = "q" ]] ; do
_scan_page
_file_rename_flatbed
((COUNTER++))
_report -d -n "Hit enter to scan a page or q to stop scanning pages (next page is ${COUNTER}): "
read PAGE_ANSWER
done

else
_report -w "You said ${SCANTYPE} for the scantype which is not valid, use 'a' or 'f'."
exit 1
fi
=======
DOUBLESIDE=""
DOCTYPE_ANSWER=""
}

# Interactive flatbed session: scans one page per confirmation until the
# operator answers 'q' (stop) or 'a' (switch scanner).
# Globals written: MIDDLE_OPTIONS, SCANTYPE_ANSWER, PAGE_ANSWER, COUNTER,
#                  DOCTYPE_ANSWER, and SCANTYPE (only when 'a' is answered
#                  inside the loop).
# Calls: _report, _ask_doctype, _scan_page, _file_rename_flatbed.
_scan_page_flatbed(){
# replaces (does not append to) whatever options were set before
MIDDLE_OPTIONS=(--flatbed)
SCANTYPE_ANSWER="flatbed"
#((COUNTER++))
_report -d -n "Hit enter to scan a page or q to stop scanning pages: "
read PAGE_ANSWER

# any answer other than 'q' or 'a' (including plain enter) scans another page
while [[ "${PAGE_ANSWER}" != "q" && "${PAGE_ANSWER}" != "a" ]] ; do
_ask_doctype
_scan_page
_file_rename_flatbed
((COUNTER++))
_report -d -n "Hit enter to scan a page, a to change the scanner or q to stop scanning pages (next page is ${COUNTER}): "
read PAGE_ANSWER
if [[ "${PAGE_ANSWER}" == "a" ]] ; then
# record the switch so the caller can continue with the other scanner
SCANTYPE="a"
echo "scantype is ${SCANTYPE} now."
fi
# cleared so that _ask_doctype prompts again for the next page
DOCTYPE_ANSWER=""
#echo "doctype is ${DOCTYPE_ANSWER}"
#_ask_doctype
done
}



# Main scan dispatch. COUNTER is the running scan number used by the scan and
# rename helpers.
COUNTER=1
if [[ "${SCANTYPE}" == "f" ]] ; then
_scan_page_flatbed
fi

# Deliberately a second if rather than an elif: _scan_page_flatbed sets
# SCANTYPE to "a" when the operator asks to change the scanner, and the
# session then continues here.
if [[ "${SCANTYPE}" == "a" ]] ; then
#((COUNTER++))
# PAGE_ANSWER may already be "q" from a preceding flatbed session, in which
# case no scan is started here
while [[ "${PAGE_ANSWER}" != "q" ]] ; do
_scan_page_adf
#echo "duplex is ${DOUBLESIDE}"
((COUNTER++))
_report -d -n "Hit a to continue scanning, f to change the scanner or q to stop scanning pages: "
read PAGE_ANSWER
if [[ "${PAGE_ANSWER}" == "f" ]] ; then
SCANTYPE="f"
echo "scantype is ${SCANTYPE} now."
# runs its own prompt loop; the PAGE_ANSWER it leaves behind decides
# whether this loop goes on
_scan_page_flatbed
fi
done
fi

# Reached with an unrecognised SCANTYPE only when neither branch above ran.
if [[ "${SCANTYPE}" != "a" && "${SCANTYPE}" != "f" && "${SCANTYPE}" != "q" ]] ; then
_report -w "You said ${SCANTYPE} for the scantype which is not valid, use 'a' or 'f'."
exit 1
fi



>>>>>>> Stashed changes

FIRST=$(find "${ORIGDIR}" -type f -mindepth 1 -maxdepth 1 ! -name ".*" -exec ls -1rt '{}' \; | head -n 1)
LAST=$(ls -1t "${ORIGDIR}" | head -n 1)
open -a /Applications/Preview.app/ "${FIRST}" "${ORIGDIR}/${LAST}"
open "${FIRST}" "${ORIGDIR}/${LAST}"
END=$(date -u "+%Y%m%dT%H%M%SZ")
SYSTEM_DATA=$(system_profiler SPHardwareDataType)
#These retrieved the right info on the Mac I'm using, but I don't know how standard the output is
SERIAL_NUMBER=$(echo "${SYSTEM_DATA}" | grep "Serial Number" | awk '{ print $4 }')
MODEL=$(echo "${SYSTEM_DATA}" | grep "Model Identifier" | awk '{ print $3; }')
OS=$(system_profiler SPSoftwareDataType | grep "System Version" | awk '{ print substr(${0}, index(${0},$3)); }')
SERIAL_NUMBER="$(echo "${SYSTEM_DATA}" | grep "Serial Number" | awk '{ print $4 }')"
MODEL="$(echo "${SYSTEM_DATA}" | grep "Model Identifier" | awk '{ print $3; }')"
OS="$(system_profiler SPSoftwareDataType | grep "System Version" | cut -d ":" -f 2- | awk '{$1=$1;print}')"
echo "datetime_start: ${START}" >> "${LOGDIR}/capture.log"
echo "datetime_end: ${END}" >> "${LOGDIR}/capture.log"
echo "serial number: ${SERIAL_NUMBER}" >> "${LOGDIR}/capture.log"
echo "model id: ${MODEL}" >> "${LOGDIR}/capture.log"
echo "os: ${OS}" >> "${LOGDIR}/capture.log"
echo "identifier: ${MEDIAID}" >> "${LOGDIR}/capture.log"
echo "operator: ${OP}" >> "${LOGDIR}/capture.log"
echo "command: ${COMMAND}" >> "${LOGDIR}/capture.log"
echo "scantype: ${SCANTYPE_ANSWER}" >> "${LOGDIR}/capture.log"
echo "doctype: ${DOCTYPE}" >> "${LOGDIR}/capture.log"
echo "fscanx_options: ${MIDDLE_OPTIONS[@]}" >> "${LOGDIR}/capture.log"
echo "fscanx_options: ${MIDDLE_OPTIONS[@]} ${COMMAND_OPTIONS[@]}" >> "${LOGDIR}/capture.log"

echo done scanning "${MEDIAID}"

Expand Down