From 19eaa466566530de7f712262f21403090258f983 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Wed, 17 Mar 2021 18:25:38 +0100
Subject: [PATCH] tools: rudimentary docx importer

Add a rudimentary importer for Microsoft Word docx files.

It uses pandoc to convert the Word documents to Markdown and extracts the
embedded images into static/img/. It's also fixing the paths and names of
the images so we can generally use it.

If an author, date or description is provided the article meta data is
patched to use it. Also a fotogrid section will be created.

What's left is cleaning the generated Markdown from Word's image size
constraints, I have not found a generally usable way that works without
manual intervention. Also resizing the images is not yet done.

Signed-off-by: Johannes Thumshirn
---
Review note: the script below was revised during review, so the blob id in
the "index" line still refers to the originally posted version.

 tools/import-docx.sh | 110 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100755 tools/import-docx.sh

diff --git a/tools/import-docx.sh b/tools/import-docx.sh
new file mode 100755
index 0000000..a70e565
--- /dev/null
+++ b/tools/import-docx.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0
+# vim: set sw=4 ts=4 ex
+#
+# Import a Microsoft Word .docx file as an article page.
+#
+# pandoc converts the document to Markdown and extracts the embedded media.
+# The extracted JPEG images are renamed after the article and moved to
+# static/img/, the image references in the Markdown are rewritten to match,
+# and the page is written to pages/ with a metadata header in front of the
+# text and a fotogrid section listing the images behind it.
+#
+# Run it from the directory that contains pages/ and static/img/.
+
+# Print a message on stderr and exit with a failure status.
+die() {
+    printf '%s\n' "$*" >&2
+    exit 1
+}
+
+# Print the usage text and exit with a failure status.
+usage() {
+    die "Usage: ${0##*/} [-a author] [-d date ] [-D description] docx"
+}
+
+# Remove the scratch directory; installed as the EXIT trap below.
+cleanup() {
+    if [[ -n "$TEMPDIR" ]]; then
+        rm -rf -- "$TEMPDIR"
+    fi
+}
+
+AUTHOR="FIXME"
+DATE=$(date +"%d.%m.%Y")
+DESCRIPTION="FIXME"
+TEMPDIR=""
+
+# Every option handled in the case below must be listed here, -D included.
+while getopts "a:d:D:" opts; do
+    case "$opts" in
+        a) AUTHOR=$OPTARG ;;
+        d) DATE=$OPTARG ;;
+        D) DESCRIPTION=$OPTARG ;;
+        *) usage ;;
+    esac
+done
+
+shift "$((OPTIND - 1))"
+if [[ $# -lt 1 ]]; then
+    usage
+fi
+DOCX=$1
+
+command -v pandoc >/dev/null 2>&1 \
+    || die "Please install pandoc to use ${0##*/}"
+[[ -f "$DOCX" ]] || die "$DOCX: no such file"
+[[ -d pages && -d static/img ]] \
+    || die "pages/ and static/img/ must exist in the current directory"
+
+# Article name: the file name without its directory and .docx suffix.
+ARTICLE=${DOCX##*/}
+ARTICLE=${ARTICLE%.docx}
+MD="$ARTICLE.md"
+
+# The article name goes into the replacement half of the sed expression
+# below, so escape the characters that are special there.
+ARTICLE_SED=$(printf '%s' "$ARTICLE" | sed -e 's/[\\&:]/\\&/g')
+
+TEMPDIR=$(mktemp -d import-docx.XXXXXX) \
+    || die "Cannot create a temporary directory"
+trap cleanup EXIT
+
+pandoc -f docx -t markdown -o "$TEMPDIR/$MD" \
+    --extract-media "$TEMPDIR" "$DOCX" \
+    || die "pandoc failed to convert $DOCX"
+
+# Point the image references at their final location and name.
+sed -i "s:$TEMPDIR/media/image:static/img/$ARTICLE_SED:g" "$TEMPDIR/$MD" \
+    || die "Cannot rewrite the image paths in $MD"
+
+# With nullglob a pattern without matches expands to nothing instead of
+# being passed on literally.
+shopt -s nullglob
+
+for f in "$TEMPDIR"/media/*.jpeg; do
+    name=${f##*/}
+    mv -- "$f" "static/img/$ARTICLE${name#image}" \
+        || die "Cannot move $f to static/img/"
+done
+
+{
+    printf 'title: %s\n' "$ARTICLE"
+    printf 'date: %s\n' "$DATE"
+    printf 'author: %s\n' "$AUTHOR"
+    printf 'description: %s\n' "$DESCRIPTION"
+    printf '\n'
+    cat -- "$TEMPDIR/$MD"
+    printf '\n'
+    # NOTE(review): the text of this line looks damaged in the copy of the
+    # patch that was reviewed (possibly a stripped HTML tag); as visible it
+    # only emitted empty lines. Confirm against the original posting.
+    printf '\n\n'
+    printf '{{ fotogrid([\n'
+    for pic in "static/img/$ARTICLE"*.jpeg; do
+        printf '"%s",\n' "$pic"
+    done
+    printf ']) | safe }}\n'
+} > "pages/$MD" || die "Cannot write pages/$MD"
+
+# The scratch directory is removed by the EXIT trap.