A script to turn unencrypted epub files into plain text, bash

September 05, 2024

#!/bin/bash
#
# Convert every unencrypted EPUB found under a directory into plain text.
#
# Usage: <script> <path-to-directory>
#
# Each *.epub found (recursively) under the directory is unpacked into a
# temporary directory; the HTML/XHTML/XML files of its content folder are
# stripped of markup with sed and appended, one after another, to
# <path-to-directory>/<epub-name>.txt.
#
# Requires: find, unzip, sed, grep, sort, head, mktemp.

set -u

# Check if a directory is provided
if [ "$#" -ne 1 ]; then
  echo "Usage: $0 <path-to-directory>" >&2
  exit 1
fi
DIRECTORY="$1"

# Check if the provided argument is a directory
if [ ! -d "$DIRECTORY" ]; then
  echo "The specified path is not a directory." >&2
  exit 1
fi

# Remove the temporary directory of the EPUB currently being processed, so an
# interrupted run (Ctrl-C, error exit) does not leave extracted books behind.
TEMP_DIR=""
cleanup() {
  if [ -n "$TEMP_DIR" ]; then
    rm -rf -- "$TEMP_DIR"
  fi
}
trap cleanup EXIT

# Process each EPUB file in the directory. The list is NUL-delimited and fed
# through process substitution so that odd filenames survive and the loop runs
# in the main shell (which keeps TEMP_DIR visible to the EXIT trap).
while IFS= read -r -d '' EPUB_FILE; do
  # Extract the base name without the extension
  BASE_NAME=$(basename -- "$EPUB_FILE" .epub)
  TEXT_FILE="$DIRECTORY/${BASE_NAME}.txt"
  echo "Processing $EPUB_FILE"

  # Create a temporary directory to extract the EPUB file
  if ! TEMP_DIR=$(mktemp -d); then
    TEMP_DIR=""
    echo "Could not create a temporary directory for $EPUB_FILE" >&2
    exit 1
  fi
  echo "Extracting EPUB file to $TEMP_DIR"

  # Unzip the EPUB file into the temporary directory.
  # stdin is redirected so unzip can never swallow the list of EPUB files the
  # loop is reading; -o avoids interactive overwrite prompts.
  unzip -q -o "$EPUB_FILE" -d "$TEMP_DIR" </dev/null
  UNZIP_STATUS=$?
  # unzip exits with 1 for mere warnings; anything above that is a real error.
  if [ "$UNZIP_STATUS" -gt 1 ]; then
    echo "Could not unzip $EPUB_FILE (unzip exit status $UNZIP_STATUS), skipping." >&2
    rm -rf -- "$TEMP_DIR"
    TEMP_DIR=""
    continue
  fi

  # Find the content directory (OEBPS, EPUB, ops folder, or text subdirectory).
  # Several directories may match; take the first one in sorted order so that
  # CONTENT_DIR is always a single path.
  CONTENT_DIR=$(find "$TEMP_DIR" -type d \
    \( -iname "OEBPS" -o -iname "EPUB" -o -iname "ops" \) \
    | LC_ALL=C sort | head -n 1)
  if [ -z "$CONTENT_DIR" ]; then
    # If the main content directory is not found, check for 'text' subdirectory
    CONTENT_DIR=$(find "$TEMP_DIR" -type d -iname "text" \
      | LC_ALL=C sort | head -n 1)
  fi

  if [ -z "$CONTENT_DIR" ]; then
    echo "Content directory not found in $EPUB_FILE!" >&2
    rm -rf -- "$TEMP_DIR"
    TEMP_DIR=""
    continue
  fi

  # Find all HTML/XHTML/XML files and process them
  echo "Extracting text content to $TEXT_FILE"
  : > "$TEXT_FILE" # Empty the output file if it exists

  # Files are handled in byte-wise sorted path order so the output is
  # reproducible. NOTE(review): this is not necessarily the book's reading
  # order, which is defined by the OPF spine - confirm if ordering matters.
  while IFS= read -r -d '' FILE; do
    echo "Processing $FILE"

    # Get the filename without the path
    FILE_NAME=$(basename -- "$FILE")

    # Append the filename as a heading to the output text file
    printf '\n\n%s\n\n' "$FILE_NAME" >> "$TEXT_FILE"

    # Extract text content from the HTML/XHTML/XML file: drop every <...> tag
    # and then every blank line. sed prints ALL lines (no -n/p), so text lines
    # that contain no markup at all are kept as well.
    sed 's/<[^>]*>//g' "$FILE" | grep -v '^[[:space:]]*$' >> "$TEXT_FILE"
  done < <(find "$CONTENT_DIR" -type f \
    \( -iname "*.html" -o -iname "*.xhtml" -o -iname "*.xml" \) -print0 \
    | LC_ALL=C sort -z)

  # Verify if the file was written correctly
  if [ ! -s "$TEXT_FILE" ]; then
    echo "Warning: $TEXT_FILE is empty or not created correctly." >&2
  fi

  # Clean up
  rm -rf -- "$TEMP_DIR"
  TEMP_DIR=""

  echo "Text extraction complete for $EPUB_FILE. Output saved to $TEXT_FILE"
done < <(find "$DIRECTORY" -type f -name "*.epub" -print0)

Search This Blog

John's Linux blog

A script to turn unencrypted epub files into plain text, bash

Popular posts from this blog

Automatically Fix Song Metadata and Filenames on Linux with Beets

throttle traffic on apache

Enable Anydesk on Linux