#!/usr/bin/python3
#
# Copyright (c) 2019-2024 Ruben Perez Hidalgo (rubenperez038 at gmail dot com)
#
# Distributed under the Boost Software License, Version 1.0. (See accompanying
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#

import requests
from bs4 import BeautifulSoup
import os
from os import path

# Repository root: two directory levels above the directory holding this script.
REPO_BASE = path.abspath(
    path.join(path.dirname(__file__), path.pardir, path.pardir)
)
# Directory of HTML documentation that list_doc_files() walks.
DOC_PATH = path.join(REPO_BASE, 'doc', 'html')

def list_doc_files():
    """Return the paths of all ``.html`` files found under DOC_PATH.

    The directory tree is walked recursively with os.walk; each returned
    path is the walked directory joined with the file name.
    """
    html_files = []
    for dirpath, _dirnames, filenames in os.walk(DOC_PATH):
        for filename in filenames:
            if filename.endswith('.html'):
                html_files.append(path.join(dirpath, filename))
    return html_files

def get_href(elm, current_file):
    """Return the normalized link target of an element, or None.

    Args:
        elm: Object supporting ``elm['href']`` lookup (raising KeyError
            when the attribute is missing).
        current_file: Path of the file containing the element; relative
            links are resolved against its directory.

    Returns:
        None when there is no ``href``. For ``http://`` / ``https://`` links,
        the URL with anything from ``#error_er_`` onwards removed (other
        fragments are kept). For every other link, the real path obtained
        by joining the directory of ``current_file`` with the href stripped
        of its ``#`` fragment.
    """
    try:
        href = elm['href']
    except KeyError:
        return None

    if href.startswith(('http://', 'https://')):
        # partition() returns the whole string when the marker is absent,
        # so URLs without it come back unchanged.
        url, _, _ = href.partition('#error_er_')
        return url

    target, _, _ = href.partition('#')
    base_dir = path.dirname(current_file)
    return path.realpath(path.join(base_dir, target))

def extract_links():
    """Collect the links found in every documentation HTML file.

    Each ``<a>`` element of each file returned by list_doc_files() is
    normalized with get_href() and classified as external (``http://`` or
    ``https://`` URL) or internal (absolute filesystem path).

    Returns:
        A tuple ``(external_links, internal_links)``. Both are dicts mapping
        a link target to the path of a file containing that link. When
        several files contain the same target, the last file processed wins.
    """
    external_links = {}
    internal_links = {}

    for fname in list_doc_files():
        # Read raw bytes and let BeautifulSoup work out the document
        # encoding, instead of decoding with the locale's default encoding
        # (which can fail or mis-decode files written in another encoding).
        with open(fname, 'rb') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        for elm in soup.find_all('a'):
            link = get_href(elm, fname)
            if link is None:
                continue
            if link.startswith(('http://', 'https://')):
                external_links[link] = fname
            elif path.isabs(link):
                # isabs() rather than startswith('/'): resolved paths begin
                # with a drive letter on Windows and would be skipped.
                internal_links[link] = fname

    return (external_links, internal_links)

def check_external_links(links, timeout=30):
    """Send a HEAD request to every external URL and report failures.

    Args:
        links: Dict whose keys are the URLs to check; the values are not
            used. URLs are checked in sorted order.
        timeout: Seconds to wait for each request before giving up.

    Prints a ``Checking`` line per URL, followed by a ``++++`` line when the
    final response (after following redirects) is not 200 or when the
    request itself fails.
    """
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as s:
        for url in sorted(links):
            print('Checking ', url)
            try:
                # Without a timeout a single unresponsive server would
                # block the whole run.
                response = s.head(url, allow_redirects=True, timeout=timeout)
            except requests.RequestException as err:
                # Report unreachable hosts, timeouts, etc. and keep going
                # so that one dead link does not hide the others.
                print('  ++++ {} request failed: {}'.format(url, err))
                continue
            if response.status_code != 200:
                print('  ++++ {} response code: {}'.format(url, response.status_code))
            
def check_internal_links(links):
    """Report every internal link whose target does not exist on disk.

    Args:
        links: Dict mapping a target path to the file that links to it.

    Prints one ``++++`` line per missing target; prints nothing for targets
    that exist.
    """
    report = '  ++++ Link {} in file {} does not exist'
    for target in links:
        if path.exists(target):
            continue
        print(report.format(target, links[target]))
            
def main():
    """Extract all documentation links, then check external and internal ones."""
    external_links, internal_links = extract_links()
    check_external_links(external_links)
    check_internal_links(internal_links)
    
# Run the link checks only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
