Add fetch_url tool for link ingestion

This commit is contained in:
Your Name 2025-12-20 15:01:26 -06:00
parent e95bb18e29
commit 923131b681

View file

@ -4,6 +4,8 @@ from __future__ import annotations
from collections.abc import Callable
import json
import re
from html import unescape
from typing import Any, Literal
from groq._types import NOT_GIVEN
@ -142,6 +144,42 @@ def _searxng_tool() -> ChatCompletionToolParam:
return ChatCompletionToolParam(type="function", function=tool_spec)
def _fetch_tool() -> ChatCompletionToolParam:
    """Build the tool schema advertising the fetch_url function to the model."""
    spec = FunctionDefinition(
        name="fetch_url",
        description="Fetch a URL and return cleaned text content.",
        parameters={
            "type": "object",
            "properties": {
                # Target to download; the only required argument.
                "url": {"type": "string"},
                # Soft cap on returned text length; 0 or negative disables truncation.
                "max_chars": {"type": "integer", "default": 4000},
            },
            "required": ["url"],
        },
    )
    return ChatCompletionToolParam(type="function", function=spec)
def _strip_html(text: str) -> str:
# Cheap HTML-to-text for summarization; avoids extra deps.
text = re.sub(r"(?s)<script.*?>.*?</script>", " ", text)
text = re.sub(r"(?s)<style.*?>.*?</style>", " ", text)
text = re.sub(r"<[^>]+>", " ", text)
text = re.sub(r"\\s+", " ", text)
return unescape(text).strip()
async def _run_fetch(hass: HomeAssistant, url: str, max_chars: int) -> dict[str, Any]:
    """Download *url* via the shared HA client session and return cleaned text.

    Raises aiohttp client errors on network failure or non-2xx status
    (raise_for_status); callers are expected to handle them.
    """
    client = async_get_clientsession(hass)
    # NOTE(review): passing a bare int as timeout= is deprecated in newer
    # aiohttp — confirm whether ClientTimeout should be used here.
    async with client.get(url, timeout=20) as response:
        response.raise_for_status()
        ctype = response.headers.get("content-type", "")
        raw = await response.text()
    cleaned = _strip_html(raw)
    # max_chars <= 0 means "no truncation".
    body = cleaned[:max_chars] if max_chars > 0 else cleaned
    return {"url": url, "content_type": ctype, "text": body}
async def _run_searxng(
hass: HomeAssistant,
options: dict[str, Any],
@ -258,6 +296,9 @@ class GroqdConversationEntity(
if tools is None:
tools = []
tools.append(_searxng_tool())
if tools is None:
tools = []
tools.append(_fetch_tool())
memory_scope = options.get(CONF_MEMORY_SCOPE, DEFAULT_MEMORY_SCOPE)
memory_key = None
@ -448,6 +489,17 @@ class GroqdConversationEntity(
tool_response = await _run_searxng(self.hass, options, tool_args)
except Exception as err:
tool_response = {"error": type(err).__name__, "error_text": str(err)}
elif tool_name == "fetch_url":
url = tool_args.get("url", "")
max_chars = tool_args.get("max_chars", 4000)
try:
max_chars = int(max_chars)
except (TypeError, ValueError):
max_chars = 4000
try:
tool_response = await _run_fetch(self.hass, url, max_chars)
except Exception as err:
tool_response = {"error": type(err).__name__, "error_text": str(err)}
elif llm_api:
tool_input = llm.ToolInput(
tool_name=tool_name,