const XRegExp = require('xregexp');

/**
 * Converts HTML content to plain text by dropping the markup and keeping the
 * text of adjacent block-like elements separated.
 *
 * A whitespace separator is inserted after every block-like element (and after
 * `<br>` / `<hr>`) so that e.g. `<li>one</li><li>two</li>` does not collapse
 * into "onetwo". All runs of whitespace, including the inserted separators,
 * are then normalized to a single space and the result is trimmed.
 *
 * Relies on the global `DOMParser`.
 *
 * @param {string} content - The HTML content to convert.
 * @returns {string} - The plain text representation of the HTML content
 *     (an empty string for `null`, `undefined` or empty input).
 */
export const convertHtmlToText = (content) => {
    // Without this guard, null/undefined would be stringified and parsed as
    // the literal text "null"/"undefined".
    if (content == null || content === '') {
        return '';
    }

    // Elements whose end marks a text boundary even when the markup contains
    // no whitespace between them.
    const boundaryTags = [
        'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'section', 'article',
        'li', 'ul', 'ol', 'dt', 'dd', 'blockquote', 'pre',
        'table', 'tr', 'td', 'th', 'br', 'hr',
    ];

    // The parsed document is local to this call, so it is modified in place.
    const root = new DOMParser().parseFromString(content, 'text/html').body;

    // One combined selector instead of one query per tag.
    root.querySelectorAll(boundaryTags.join(',')).forEach((el) => {
        // The exact separator is irrelevant: it is collapsed to one space below.
        el.insertAdjacentText('afterend', '\n\n');
    });

    // Collapse every whitespace run to a single space and trim the ends.
    return (root.textContent || '').replace(/\s+/g, ' ').trim();
};

/**
 * Counts words in the given content, with support for both HTML and plain text.
 *
 * A word is a run of letters, combining marks, ASCII digits and the
 * apostrophe/hyphen/degree-like characters listed in the token pattern, as
 * long as the run contains at least one letter, mark or digit. Runs made up
 * only of those symbols (e.g. a free-standing "-") are not words, wherever
 * they appear in the text.
 *
 * @param {string} content - The content to analyze.
 * @param {boolean} [isHtml=false] - Whether the content is HTML; if so it is
 *     first reduced to plain text with `convertHtmlToText`.
 * @returns {number} - The word count (0 for empty or missing content).
 */
export const countWords = (content, isHtml = false) => {
    if (!content) return 0;

    const contentText = isHtml ? convertHtmlToText(content) : content;

    // Candidate tokens: word characters plus symbols that may occur inside a
    // word ("it's", "well-known", "45°").
    const tokenPattern = XRegExp("[\\p{L}\\p{M}0-9′‵ʹʼˈ`'\\-°’]+", 'gu');
    // A real word has at least one of these. Deliberately not global, so
    // `test` keeps no `lastIndex` state between tokens.
    const wordCharPattern = XRegExp('[\\p{L}\\p{M}0-9]', 'u');

    const tokens = XRegExp.match(contentText, tokenPattern) || [];

    // Filtering per token (rather than subtracting whitespace-delimited
    // symbol runs) also excludes symbol-only tokens at the start or end of
    // the text, next to punctuation, or directly following one another.
    return tokens.filter((token) => wordCharPattern.test(token)).length;
};

/**
 * Gets the word count of the selected text in the editor.
 *
 * The selection is read with `state.doc.textBetween(from, to, ' ', ' ')` and
 * counted as plain text, so literal `<`, `>` or `&` characters in the
 * selection are not interpreted as markup.
 *
 * @param {Object} editor - The editor instance; must expose `state.selection`
 *     (`empty`, `from`, `to`) and `state.doc.textBetween`.
 * @returns {number} - The word count of the selected text (0 when the
 *     selection is empty).
 */
export const getSelectedTextWordCount = (editor) => {
    const { selection, doc } = editor.state;

    if (selection.empty) {
        return 0;
    }

    // A space is passed for both separator arguments so that adjacent pieces
    // of the selection do not run together.
    const selectedText = doc.textBetween(selection.from, selection.to, ' ', ' ');

    // The extracted text is plain text, not HTML: do not route it through the
    // HTML parser.
    return countWords(selectedText);
};
