A Bash script to convert unencrypted EPUB files into plain text

 #!/bin/bash
# Convert every *.epub file found under a directory into a plain-text file.
#
# Usage: <this-script> <path-to-directory>
#
# For each EPUB (searched recursively, case-sensitive "*.epub"):
#   1. unzip it into a fresh temporary directory,
#   2. locate its content directory (OEBPS / EPUB / ops, else "text"),
#   3. strip the markup from every .html/.xhtml/.xml file below it,
#   4. write the result to <path-to-directory>/<epub-basename>.txt.
#
# Output files always land directly in <path-to-directory>, even for EPUBs
# found in subdirectories, so two EPUBs with the same base name overwrite
# each other's output.
#
# Progress messages go to stdout; usage, errors and warnings go to stderr.
# Exit status: 1 for a bad invocation, 0 otherwise (a broken EPUB is skipped).

set -u

# Temporary extraction directory of the EPUB currently being processed.
# Kept global so the EXIT trap can remove it if the script is interrupted.
TEMP_DIR=""

# Remove the current temporary extraction directory, if there is one.
cleanup() {
    if [ -n "$TEMP_DIR" ]; then
        rm -rf -- "$TEMP_DIR"
        TEMP_DIR=""
    fi
}
trap cleanup EXIT

# Print the content directory of an extracted EPUB, or nothing if none exists.
# Arguments: $1 - root of the extracted EPUB
# A directory named OEBPS, EPUB or ops (any case) is preferred; a directory
# named "text" is the fallback. When several directories match, only the
# first one in byte-wise path order is printed, because a multi-line result
# cannot be used as a single path by the caller.
find_content_dir() {
    local root=$1
    local found

    found=$(find "$root" -type d \( -iname "OEBPS" -o -iname "EPUB" -o -iname "ops" \) \
        | LC_ALL=C sort | head -n 1)
    if [ -z "$found" ]; then
        found=$(find "$root" -type d -iname "text" | LC_ALL=C sort | head -n 1)
    fi
    printf '%s\n' "$found"
}

# Write the text of one markup file to stdout.
# Arguments: $1 - path to an HTML/XHTML/XML file
# Tags that open and close on the same line are removed, then blank lines
# are dropped. No -n/p on sed: lines that contain no tag at all (the middle
# of a paragraph wrapped over several lines) must be kept as well.
strip_markup() {
    sed 's/<[^>]*>//g' "$1" | grep -v '^[[:space:]]*$'
}

# Convert a single EPUB file into a text file.
# Arguments: $1 - path to the EPUB file
#            $2 - directory that receives <basename>.txt
# Returns:   0 on success, 1 if the EPUB had to be skipped
convert_epub() {
    local epub_file=$1
    local out_dir=$2
    local base_name text_file content_dir page
    local unzip_status=0

    # Extract the base name without the extension
    base_name=$(basename "$epub_file" .epub)
    text_file="$out_dir/${base_name}.txt"
    echo "Processing $epub_file"

    # Create a temporary directory to extract the EPUB file
    if ! TEMP_DIR=$(mktemp -d); then
        TEMP_DIR=""
        echo "Could not create a temporary directory for $epub_file" >&2
        return 1
    fi
    echo "Extracting EPUB file to $TEMP_DIR"

    # stdin is redirected so that unzip can never prompt on, and swallow,
    # the list of EPUB files the caller's loop is reading.
    unzip -q "$epub_file" -d "$TEMP_DIR" </dev/null || unzip_status=$?
    # unzip documents status 1 as "warnings, but processing completed";
    # anything above that is treated as a failed extraction.
    if [ "$unzip_status" -gt 1 ]; then
        echo "Failed to unzip $epub_file (unzip exit status $unzip_status)" >&2
        cleanup
        return 1
    fi

    content_dir=$(find_content_dir "$TEMP_DIR")
    if [ -z "$content_dir" ]; then
        echo "Content directory not found in $epub_file!" >&2
        cleanup
        return 1
    fi

    echo "Extracting text content to $text_file"
    # Create the output file, or empty it if it already exists
    if ! printf '' > "$text_file"; then
        echo "Cannot write to $text_file" >&2
        cleanup
        return 1
    fi

    # NUL-delimited so that unusual file names survive; sorted so the output
    # is the same on every run. This is byte-wise path order - the script
    # does not read the EPUB's own reading-order metadata.
    while IFS= read -r -d '' page; do
        echo "Processing $page"

        # File name (without the path) as a heading. printf rather than
        # echo -e so that backslashes in the name are not interpreted.
        printf '\n\n%s\n\n' "${page##*/}" >> "$text_file"

        strip_markup "$page" >> "$text_file"
    done < <(find "$content_dir" -type f \
        \( -iname "*.html" -o -iname "*.xhtml" -o -iname "*.xml" \) -print0 \
        | LC_ALL=C sort -z)

    # Verify if the file was written correctly
    if [ ! -s "$text_file" ]; then
        echo "Warning: $text_file is empty or not created correctly." >&2
    fi

    # Clean up
    cleanup

    echo "Text extraction complete for $epub_file. Output saved to $text_file"
    return 0
}

# Check if a directory is provided
if [ "$#" -ne 1 ]; then
    echo "Usage: $0 <path-to-directory>" >&2
    exit 1
fi
DIRECTORY="$1"

# Check if the provided argument is a directory
if [ ! -d "$DIRECTORY" ]; then
    echo "The specified path is not a directory." >&2
    exit 1
fi

# Process each EPUB file in the directory
while IFS= read -r -d '' EPUB_FILE; do
    # A broken EPUB is reported by convert_epub and must not stop the batch.
    convert_epub "$EPUB_FILE" "$DIRECTORY" || continue
done < <(find "$DIRECTORY" -type f -name "*.epub" -print0)

Popular posts from this blog

Pause a program which is using too much CPU

throttle traffic on apache

/var/log/journal taking up lots of space