A Bash script that converts unencrypted EPUB files into plain text
#!/bin/bash
#
# Convert every unencrypted EPUB under a directory into a plain-text file.
#
# Usage: <this-script> <path-to-directory>
#
# Every *.epub found (recursively) under the directory is unpacked into a
# temporary directory. The HTML/XHTML/XML files inside its content folder(s)
# are stripped of markup and concatenated, each preceded by its file name as
# a heading, into <path-to-directory>/<epub-base-name>.txt.
#
# Exit status: 1 on a usage error or when the argument is not a directory.
# Problems with an individual EPUB are reported on stderr and that EPUB is
# skipped; they do not change the exit status.
#
# Requires: find, unzip, sed, sort, mktemp, basename.

# No `set -e`: one broken EPUB must not abort the whole batch, so failures
# that matter are checked explicitly below.
set -uo pipefail

# Extraction directory of the EPUB currently being processed. Global so that
# the EXIT trap can remove it when the script is interrupted mid-conversion.
TEMP_DIR=""

# Remove the current extraction directory, if there is one.
cleanup() {
  if [[ -n "$TEMP_DIR" && -d "$TEMP_DIR" ]]; then
    rm -rf -- "$TEMP_DIR"
  fi
  TEMP_DIR=""
}
trap cleanup EXIT
# Turn signals into a normal exit so the EXIT trap above always runs.
trap 'exit 130' INT
trap 'exit 143' TERM

#######################################
# Print the text of one markup file with tags removed and blank lines dropped.
# Every line is kept, including lines that contain no tag at all (e.g. the
# middle of a paragraph wrapped over several lines).
# Limitation: sed works line by line, so a tag that itself spans several
# lines is not removed.
# Arguments: $1 - path of the HTML/XHTML/XML file
# Outputs:   stripped text on stdout
#######################################
strip_markup() {
  sed -e 's/<[^>]*>//g' -e '/^[[:space:]]*$/d' -- "$1"
}

#######################################
# Convert a single EPUB archive into a text file.
# Globals:   TEMP_DIR (written; the caller is expected to call cleanup)
# Arguments: $1 - path of the EPUB file
#            $2 - path of the text file to (over)write
# Outputs:   progress on stdout, diagnostics on stderr
# Returns:   0 when the text file was produced, 1 when the EPUB was skipped
#######################################
convert_epub() {
  local epub=$1
  local text_file=$2
  local -a content_dirs=()
  local dir file

  echo "Processing $epub"

  if ! TEMP_DIR=$(mktemp -d); then
    TEMP_DIR=""
    echo "Could not create a temporary directory for $epub" >&2
    return 1
  fi
  echo "Extracting EPUB file to $TEMP_DIR"

  # stdin comes from /dev/null so that an unzip prompt can neither block nor
  # swallow the list of EPUB paths the calling loop is reading.
  # A non-zero status is only reported: unzip also uses it for warnings, and
  # the content-directory check below catches a truly failed extraction.
  if ! unzip -q "$epub" -d "$TEMP_DIR" </dev/null; then
    echo "Warning: unzip reported problems with $epub" >&2
  fi

  # Collect the content directories (OEBPS, EPUB or ops, any letter case).
  # -prune keeps find from descending into a match, so a matching directory
  # nested inside another match is not listed (and processed) twice.
  while IFS= read -r -d '' dir; do
    content_dirs+=("$dir")
  done < <(
    find "$TEMP_DIR" -type d \
      \( -iname "OEBPS" -o -iname "EPUB" -o -iname "ops" \) \
      -prune -print0
  )

  # Fall back to 'text' subdirectories when none of the usual names exist.
  if (( ${#content_dirs[@]} == 0 )); then
    while IFS= read -r -d '' dir; do
      content_dirs+=("$dir")
    done < <(find "$TEMP_DIR" -type d -iname "text" -prune -print0)
  fi

  if (( ${#content_dirs[@]} == 0 )); then
    echo "Content directory not found in $epub!" >&2
    return 1
  fi

  echo "Extracting text content to $text_file"
  # Create the output file, or empty it if it already exists.
  if ! printf '' > "$text_file"; then
    echo "Cannot write to $text_file" >&2
    return 1
  fi

  # Paths are NUL-delimited so unusual file names survive, and sorted
  # byte-wise so the order of the sections is the same on every run.
  while IFS= read -r -d '' file; do
    echo "Processing $file"
    # The file name (without its path) becomes the section heading.
    printf '\n\n%s\n\n' "${file##*/}" >> "$text_file"
    strip_markup "$file" >> "$text_file"
  done < <(
    find "${content_dirs[@]}" -type f \
      \( -iname "*.html" -o -iname "*.xhtml" -o -iname "*.xml" \) \
      -print0 | LC_ALL=C sort -z
  )

  if [[ ! -s "$text_file" ]]; then
    echo "Warning: $text_file is empty or not created correctly." >&2
  fi

  echo "Text extraction complete for $epub. Output saved to $text_file"
}

#######################################
# Validate the command line and convert every EPUB below the directory.
# Arguments: $1 - directory to search for *.epub files
#######################################
main() {
  if [[ "$#" -ne 1 ]]; then
    echo "Usage: $0 <path-to-directory>" >&2
    exit 1
  fi

  local directory=$1
  if [[ ! -d "$directory" ]]; then
    echo "The specified path is not a directory." >&2
    exit 1
  fi

  local epub base_name
  # Process substitution (rather than a pipe) keeps the loop in the main
  # shell, so TEMP_DIR is visible to the EXIT trap.
  while IFS= read -r -d '' epub; do
    base_name=$(basename -- "$epub" .epub)
    # A skipped EPUB has already been reported; carry on with the next one.
    convert_epub "$epub" "$directory/${base_name}.txt" || true
    cleanup
  done < <(find "$directory" -type f -name "*.epub" -print0)
}

main "$@"