<?php
/**
 * Sitemap Parser Class
 * Handles parsing of sitemap.xml files
 */

// Stop immediately unless the ABSPATH constant is defined, so the file does
// nothing when it is requested directly instead of being loaded by WordPress.
if (!defined('ABSPATH')) {
    exit;
}

/**
 * Fetches sitemap.xml documents over HTTP and turns them into a flat list of
 * links (URL + human readable title). Sitemap index files are followed
 * recursively, with protection against cycles and runaway nesting.
 */
class SEOINUX_Sitemap_Parser {

    /**
     * Maximum nesting depth followed when sitemap indexes reference further
     * sitemap indexes. The sitemap passed to parse() is depth 0.
     */
    const MAX_INDEX_DEPTH = 5;

    /**
     * Parse sitemap and extract URLs with titles
     *
     * Failures (HTTP errors, non-200 responses, empty bodies, malformed XML)
     * are logged and yield an empty result for the affected sitemap instead
     * of raising.
     *
     * @param string $sitemap_url URL of the sitemap
     * @return array Array of links; each entry is array('url' => string, 'title' => string)
     */
    public function parse($sitemap_url) {
        // Tracks every sitemap URL already fetched during this call so that
        // an index referencing itself (directly or indirectly) cannot cause
        // unbounded recursion.
        $visited = array();

        return $this->parse_sitemap($sitemap_url, $visited, 0);
    }

    /**
     * Parse one sitemap document, recursing into sub-sitemaps of an index.
     *
     * @param string $sitemap_url URL of the sitemap to fetch
     * @param array  $visited     Set (URL => true) of sitemaps already processed; updated in place
     * @param int    $depth       Current nesting depth, 0 for the top-level sitemap
     * @return array Array of links with URLs and titles
     */
    private function parse_sitemap($sitemap_url, array &$visited, $depth) {
        $links = array();
        $sitemap_url = trim((string) $sitemap_url);

        if ($sitemap_url === '') {
            error_log('SEOINUX Sitemap Parser Error: Empty sitemap URL');
            return $links;
        }

        if (isset($visited[$sitemap_url])) {
            error_log('SEOINUX: Skipping already parsed sitemap: ' . $sitemap_url);
            return $links;
        }

        if ($depth > self::MAX_INDEX_DEPTH) {
            error_log('SEOINUX: Maximum sitemap nesting depth reached, skipping: ' . $sitemap_url);
            return $links;
        }

        $visited[$sitemap_url] = true;

        error_log('SEOINUX: Starting sitemap parse for: ' . $sitemap_url);

        $xml_content = $this->fetch_body($sitemap_url);
        if ($xml_content === null) {
            return $links;
        }

        $xml = $this->load_xml($xml_content);
        if ($xml === false) {
            return $links;
        }

        // Collect every namespace declared anywhere in the document.
        $namespaces = $xml->getNamespaces(true);
        error_log('SEOINUX: Found namespaces: ' . print_r(array_keys($namespaces), true));

        // The default (prefix-less) namespace, needed for XPath lookups.
        $default_ns = isset($namespaces['']) ? $namespaces[''] : null;

        if (isset($xml->sitemap)) {
            // Sitemap index: every <sitemap> entry points at another sitemap.
            error_log('SEOINUX: Detected sitemap index');

            foreach ($xml->sitemap as $sitemap) {
                $sitemap_loc = $this->read_loc($sitemap, $default_ns);

                if ($sitemap_loc === '') {
                    // Nothing to fetch; avoid a pointless HTTP request.
                    continue;
                }

                error_log('SEOINUX: Parsing sub-sitemap: ' . $sitemap_loc);
                $sub_links = $this->parse_sitemap($sitemap_loc, $visited, $depth + 1);
                $links = array_merge($links, $sub_links);
            }
        } else {
            // Regular sitemap: every <url> entry is a page.
            error_log('SEOINUX: Detected regular sitemap');

            $url_elements = $xml->url;

            if (empty($url_elements) && $default_ns) {
                // Direct child access found nothing; retry through XPath with
                // the default namespace bound to a prefix.
                $xml->registerXPathNamespace('ns', $default_ns);
                $matched = $xml->xpath('//ns:url');
                // xpath() does not return an array on failure; never iterate that.
                $url_elements = is_array($matched) ? $matched : array();
            }

            $count = 0;
            foreach ($url_elements as $url) {
                $loc = $this->read_loc($url, $default_ns);

                if ($loc === '') {
                    continue;
                }

                $links[] = array(
                    'url' => $loc,
                    'title' => $this->extract_title($url, $namespaces, $loc)
                );
                $count++;
            }

            error_log('SEOINUX: Extracted ' . $count . ' URLs from sitemap');
        }

        error_log('SEOINUX: Total links parsed: ' . count($links));
        return $links;
    }

    /**
     * Download a sitemap and return its raw body.
     *
     * TLS certificate verification is intentionally NOT disabled here: the
     * request uses the WordPress HTTP API default for 'sslverify'.
     *
     * @param string $sitemap_url URL to fetch
     * @return string|null Response body, or null when the request failed,
     *                     returned a non-200 status, or had an empty body
     */
    private function fetch_body($sitemap_url) {
        $response = wp_remote_get($sitemap_url, array(
            'timeout' => 30,
            'user-agent' => 'SEOINUX-Bot/2.0'
        ));

        if (is_wp_error($response)) {
            error_log('SEOINUX Sitemap Parser Error: ' . $response->get_error_message());
            return null;
        }

        $status_code = wp_remote_retrieve_response_code($response);
        error_log('SEOINUX: HTTP Status: ' . $status_code);

        if ((int) $status_code !== 200) {
            error_log('SEOINUX: Invalid HTTP status code: ' . $status_code);
            return null;
        }

        $xml_content = wp_remote_retrieve_body($response);

        if (empty($xml_content)) {
            error_log('SEOINUX Sitemap Parser Error: Empty sitemap content');
            return null;
        }

        error_log('SEOINUX: Sitemap content length: ' . strlen($xml_content));

        return $xml_content;
    }

    /**
     * Parse an XML string, logging libxml errors on failure.
     *
     * The process-wide libxml error handling mode is restored afterwards so
     * other code running in the same request is not affected.
     *
     * @param string $xml_content Raw XML
     * @return SimpleXMLElement|false Parsed document, or false when the XML is malformed
     */
    private function load_xml($xml_content) {
        $previous_mode = libxml_use_internal_errors(true);

        // LIBXML_NONET forbids network access while loading the document,
        // which matters because the content comes from a remote server.
        $xml = simplexml_load_string($xml_content, 'SimpleXMLElement', LIBXML_NONET);

        if ($xml === false) {
            error_log('SEOINUX Sitemap Parser Error: Failed to parse XML');
            foreach (libxml_get_errors() as $error) {
                error_log('XML Error: ' . $error->message);
            }
        }

        // Always drain the buffer (warnings can be recorded on success too)
        // and put the global setting back the way we found it.
        libxml_clear_errors();
        libxml_use_internal_errors($previous_mode);

        return $xml;
    }

    /**
     * Read the <loc> value of a <url> or <sitemap> element.
     *
     * @param SimpleXMLElement $element    Element containing the <loc> child
     * @param string|null      $default_ns Default namespace URI of the document, if any
     * @return string Location with surrounding whitespace removed, '' when absent
     */
    private function read_loc($element, $default_ns) {
        $loc = '';

        if ($default_ns) {
            $element->registerXPathNamespace('ns', $default_ns);
            $loc_nodes = $element->xpath('ns:loc');
            if (!empty($loc_nodes)) {
                $loc = (string) $loc_nodes[0];
            }
        }

        if (trim($loc) === '') {
            $loc = (string) $element->loc;
        }

        // Pretty-printed sitemaps frequently wrap the URL in whitespace.
        return trim($loc);
    }

    /**
     * Extract title from sitemap entry or fetch from URL
     *
     * Sources are tried in order: news extension title, image extension
     * title, the title of the WordPress post the URL resolves to, and finally
     * a title derived from the URL slug.
     *
     * @param SimpleXMLElement $url URL element from sitemap
     * @param array $namespaces XML namespaces (prefix => URI)
     * @param string $loc URL location
     * @return string Title of the page (may be empty when nothing can be derived)
     */
    private function extract_title($url, $namespaces, $loc) {
        $title = '';

        // Check for news title
        if (isset($namespaces['news'])) {
            $news = $url->children($namespaces['news']);
            if (isset($news->news)) {
                $title = trim((string) $news->news->title);
            }
        }

        // Check for image title
        if (empty($title) && isset($namespaces['image'])) {
            $image = $url->children($namespaces['image']);
            if (isset($image->image)) {
                $title = trim((string) $image->image->title);
            }
        }

        // If no title found in sitemap, try to get from WordPress
        if (empty($title)) {
            $post_id = url_to_postid($loc);
            if ($post_id) {
                $title = get_the_title($post_id);
            }
        }

        // If still no title, extract from URL
        if (empty($title)) {
            $title = $this->generate_title_from_url($loc);
        }

        return $title;
    }

    /**
     * Generate a title from URL slug
     *
     * Takes the last path segment, strips a trailing .html/.php/.htm
     * extension, turns hyphens and underscores into spaces and capitalizes
     * each word.
     *
     * @param string $url URL to extract title from
     * @return string Generated title, '' when the URL has no usable path
     */
    private function generate_title_from_url($url) {
        $path = parse_url($url, PHP_URL_PATH);

        // parse_url() yields null for URLs without a path and false for
        // malformed URLs; neither may be handed to basename().
        if (!is_string($path) || $path === '') {
            return '';
        }

        $slug = basename($path);

        // Remove file extensions
        $slug = preg_replace('/\.(html|php|htm)$/', '', $slug);

        // Replace hyphens and underscores with spaces
        $title = str_replace(array('-', '_'), ' ', (string) $slug);

        // Capitalize words
        return ucwords($title);
    }

    /**
     * Validate sitemap URL
     *
     * A URL is considered valid when it is syntactically a URL, a HEAD
     * request to it succeeds, and the Content-Type response header mentions
     * "xml" or "text/plain".
     *
     * @param string $url URL to validate
     * @return bool True if valid, false otherwise
     */
    public function validate_sitemap_url($url) {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return false;
        }

        // 'sslverify' is left at the WordPress HTTP API default so that TLS
        // certificates are verified.
        $response = wp_remote_head($url, array(
            'timeout' => 10
        ));

        if (is_wp_error($response)) {
            return false;
        }

        $content_type = wp_remote_retrieve_header($response, 'content-type');

        // The header value is an array when the server sent the header more
        // than once; flatten it before doing string matching.
        if (is_array($content_type)) {
            $content_type = implode(', ', $content_type);
        }
        $content_type = (string) $content_type;

        // Check if content type is XML (header values are case-insensitive)
        return stripos($content_type, 'xml') !== false
            || stripos($content_type, 'text/plain') !== false;
    }
}
