| | import logging |
| | import re |
| | import os |
| | from concurrent.futures import ThreadPoolExecutor, as_completed |
| | from typing import Dict, List, Union |
| | from urllib.parse import urljoin, urlparse |
| |
|
| | import requests |
| | from bs4 import BeautifulSoup |
| |
|
| | from huggingface_hub import HfApi |
| |
|
# Shared Hugging Face Hub client; authenticates via the TOKEN env var when set,
# otherwise falls back to anonymous access.
HF_API = HfApi(token=os.environ.get("TOKEN", None))
| |
|
| |
|
def get_base_url(url: str) -> str:
    """
    Extracts the base URL from a given URL.

    Parameters:
    - url (str): The URL to extract the base URL from.

    Returns:
    - str: The base URL (scheme plus network location).
    """
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}"
| |
|
| |
|
def get_domain_name(url: str) -> str:
    """
    Get the domain name from a URL.

    Strips a leading "www." and the trailing TLD, then capitalizes the
    result (e.g. "https://www.example.com" -> "Example").

    Args:
        url (str): The URL.

    Returns:
        str: The domain name.
    """
    netloc = urlparse(url).netloc
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    # Drop everything after the last dot (the TLD); a dotless host yields "".
    netloc = ".".join(netloc.split(".")[:-1])
    return netloc.capitalize()
| |
|
| |
|
def get_favicon(url: str) -> str:
    """
    Return the favicon URL declared by a web page, or a generic fallback.

    Parameters:
    - url (str): The page URL to inspect.

    Returns:
    - str: An absolute favicon URL, or a generic placeholder icon when the
      page cannot be fetched, times out, or declares no icon.
    """
    # Single fallback constant instead of the literal repeated on every path.
    fallback = "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        response = requests.get(url, headers=headers, timeout=2)
        if response.status_code != 200:
            return fallback

        soup = BeautifulSoup(response.content, "html.parser")
        # <link rel="icon" | "shortcut icon" | "apple-touch-icon" ...>
        icon_links = soup.find_all(
            "link", rel=re.compile(r"(shortcut icon|icon|apple-touch-icon)", re.I)
        )
        # <meta content="...ico">; dot escaped so only genuine ".ico" paths
        # match (the previous r".ico$" matched any character before "ico").
        meta_icons = soup.find_all(
            "meta", attrs={"content": re.compile(r"\.ico$", re.I)}
        )

        for icon in icon_links + meta_icons:
            favicon_url = icon.get("href") or icon.get("content")
            if favicon_url:
                # urljoin resolves relative ("favicon.ico"), root-relative
                # ("/favicon.ico") and protocol-relative ("//cdn/...") paths
                # against the page URL, and leaves absolute URLs untouched.
                # The old code only handled paths starting with "/".
                return urljoin(url, favicon_url)

        return fallback
    except requests.Timeout:
        logging.warning(f"Request timed out for {url}")
        return fallback
    except Exception as e:
        logging.warning(f"An error occurred while fetching favicon for {url}: {e}")
        return fallback
| |
|
| |
|
def download_favicons(urls: List[str]) -> Dict[str, str]:
    """
    Fetch favicons for a collection of page URLs concurrently.

    Parameters:
    - urls (List[str]): Page URLs; duplicates are fetched only once.

    Returns:
    - Dict[str, str]: Mapping of page URL to favicon URL (a generic
      placeholder on failure).
    """
    results: Dict[str, str] = {}
    unique_urls = list(set(urls))
    with ThreadPoolExecutor(max_workers=20) as pool:
        pending = {pool.submit(get_favicon, target): target for target in unique_urls}
        for done in as_completed(pending):
            target = pending[done]
            try:
                results[target] = done.result()
            except Exception as e:
                logging.warning(f"Failed to fetch favicon for {target}: {e}")
                results[target] = (
                    "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
                )
    return results
| |
|
| |
|
def url_exists(url):
    """
    Checks if a URL exists by making a HEAD request.

    Parameters:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL exists, False otherwise.
    """
    try:
        # Bounded timeout so an unresponsive host cannot hang the caller
        # indefinitely (the previous call had no timeout at all).
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code < 400
    except requests.RequestException:
        # DNS failure, connection error, timeout, etc. — treat as absent.
        return False
| |
|
| |
|
def build_dataset_url(dataset_name: str):
    """
    Build an HTML string with the dataset URL.

    Returns the Hub URL for the dataset, or None when the URL does not
    resolve or the repo is not registered on the Hub.
    """
    candidate = f"https://huggingface.co/datasets/{dataset_name}"
    # Guard clauses keep the original short-circuit order: cheap HEAD
    # request first, Hub API lookup second.
    if not url_exists(candidate):
        return None
    if not HF_API.repo_exists(dataset_name, repo_type="dataset"):
        return None
    return candidate
| |
|
| |
|
def build_model_url(model_name: str):
    """
    Build an HTML string with the model URL.

    Returns the Hub URL for the model, or None when the URL does not
    resolve or the repo is not registered on the Hub.
    """
    candidate = f"https://huggingface.co/{model_name}"
    # Same short-circuit order as the original conjunction.
    if not url_exists(candidate):
        return None
    if not HF_API.repo_exists(model_name, repo_type="model"):
        return None
    return candidate
| |
|
| |
|
def build_text_icon(text: str, url: Union[str, None], icon_url: str):
    """
    Render text as an inline icon+label hyperlink.

    When url is None the plain text is returned unchanged; otherwise an
    anchor tag wrapping a 16x16 icon image and the label is produced.
    """
    if url is None:
        return text
    anchor = f'<a href="{url}" target="_blank" style="text-decoration: none; color: inherit; display: inline-flex; align-items: center;">'
    image = f'<img src="{icon_url}" alt="{url}" style="display: inline-block; vertical-align: middle; margin-right: 4px;" width="16" height="16">'
    label = f'<span style="display: inline-block; vertical-align: middle;">{text}</span> </a>'
    return anchor + image + label
| |
|
| |
|
def build_datasets_urls(datasets_names: List[str]) -> Dict[str, str]:
    """
    Build a dictionary of dataset URLs from a list of dataset names.

    Parameters:
    - datasets_names (List[str]): The list of dataset names.

    Returns:
    - Dict[str, str]: A dictionary of dataset URLs.
    """
    urls: Dict[str, str] = {}
    for name in datasets_names:
        urls[name] = build_dataset_url(name)
    return urls
| |
|
| |
|
def build_models_urls(models_names: List[str]) -> Dict[str, str]:
    """
    Build a dictionary of model URLs from a list of model names.

    Parameters:
    - models_names (List[str]): The list of model names.

    Returns:
    - Dict[str, str]: A dictionary of model URLs.
    """
    urls: Dict[str, str] = {}
    for name in models_names:
        urls[name] = build_model_url(name)
    return urls
| |
|