|
|
@@ -1,8 +1,12 @@
|
|
|
+use std::collections::BTreeSet;
|
|
|
+use std::time::{Duration, Instant};
|
|
|
+
|
|
|
+use reqwest::blocking::Client;
|
|
|
use runtime::{
|
|
|
edit_file, execute_bash, glob_search, grep_search, read_file, write_file, BashCommandInput,
|
|
|
GrepSearchInput,
|
|
|
};
|
|
|
-use serde::Deserialize;
|
|
|
+use serde::{Deserialize, Serialize};
|
|
|
use serde_json::{json, Value};
|
|
|
|
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
|
@@ -140,6 +144,40 @@ pub fn mvp_tool_specs() -> Vec<ToolSpec> {
|
|
|
"additionalProperties": false
|
|
|
}),
|
|
|
},
|
|
|
+ ToolSpec {
|
|
|
+ name: "WebFetch",
|
|
|
+ description:
|
|
|
+ "Fetch a URL, convert it into readable text, and answer a prompt about it.",
|
|
|
+ input_schema: json!({
|
|
|
+ "type": "object",
|
|
|
+ "properties": {
|
|
|
+ "url": { "type": "string", "format": "uri" },
|
|
|
+ "prompt": { "type": "string" }
|
|
|
+ },
|
|
|
+ "required": ["url", "prompt"],
|
|
|
+ "additionalProperties": false
|
|
|
+ }),
|
|
|
+ },
|
|
|
+ ToolSpec {
|
|
|
+ name: "WebSearch",
|
|
|
+ description: "Search the web for current information and return cited results.",
|
|
|
+ input_schema: json!({
|
|
|
+ "type": "object",
|
|
|
+ "properties": {
|
|
|
+ "query": { "type": "string", "minLength": 2 },
|
|
|
+ "allowed_domains": {
|
|
|
+ "type": "array",
|
|
|
+ "items": { "type": "string" }
|
|
|
+ },
|
|
|
+ "blocked_domains": {
|
|
|
+ "type": "array",
|
|
|
+ "items": { "type": "string" }
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "required": ["query"],
|
|
|
+ "additionalProperties": false
|
|
|
+ }),
|
|
|
+ },
|
|
|
]
|
|
|
}
|
|
|
|
|
|
@@ -151,6 +189,8 @@ pub fn execute_tool(name: &str, input: &Value) -> Result<String, String> {
|
|
|
"edit_file" => from_value::<EditFileInput>(input).and_then(run_edit_file),
|
|
|
"glob_search" => from_value::<GlobSearchInputValue>(input).and_then(run_glob_search),
|
|
|
"grep_search" => from_value::<GrepSearchInput>(input).and_then(run_grep_search),
|
|
|
+ "WebFetch" => from_value::<WebFetchInput>(input).and_then(run_web_fetch),
|
|
|
+ "WebSearch" => from_value::<WebSearchInput>(input).and_then(run_web_search),
|
|
|
_ => Err(format!("unsupported tool: {name}")),
|
|
|
}
|
|
|
}
|
|
|
@@ -192,6 +232,14 @@ fn run_grep_search(input: GrepSearchInput) -> Result<String, String> {
|
|
|
to_pretty_json(grep_search(&input).map_err(io_to_string)?)
|
|
|
}
|
|
|
|
|
|
+fn run_web_fetch(input: WebFetchInput) -> Result<String, String> {
|
|
|
+ to_pretty_json(execute_web_fetch(&input)?)
|
|
|
+}
|
|
|
+
|
|
|
+fn run_web_search(input: WebSearchInput) -> Result<String, String> {
|
|
|
+ to_pretty_json(execute_web_search(&input)?)
|
|
|
+}
|
|
|
+
|
|
|
fn to_pretty_json<T: serde::Serialize>(value: T) -> Result<String, String> {
|
|
|
serde_json::to_string_pretty(&value).map_err(|error| error.to_string())
|
|
|
}
|
|
|
@@ -227,8 +275,411 @@ struct GlobSearchInputValue {
|
|
|
path: Option<String>,
|
|
|
}
|
|
|
|
|
|
+#[derive(Debug, Deserialize)]
|
|
|
+struct WebFetchInput {
|
|
|
+ url: String,
|
|
|
+ prompt: String,
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Deserialize)]
|
|
|
+struct WebSearchInput {
|
|
|
+ query: String,
|
|
|
+ allowed_domains: Option<Vec<String>>,
|
|
|
+ blocked_domains: Option<Vec<String>>,
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Serialize)]
|
|
|
+struct WebFetchOutput {
|
|
|
+ bytes: usize,
|
|
|
+ code: u16,
|
|
|
+ #[serde(rename = "codeText")]
|
|
|
+ code_text: String,
|
|
|
+ result: String,
|
|
|
+ #[serde(rename = "durationMs")]
|
|
|
+ duration_ms: u128,
|
|
|
+ url: String,
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Serialize)]
|
|
|
+struct WebSearchOutput {
|
|
|
+ query: String,
|
|
|
+ results: Vec<WebSearchResultItem>,
|
|
|
+ #[serde(rename = "durationSeconds")]
|
|
|
+ duration_seconds: f64,
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Serialize)]
|
|
|
+#[serde(untagged)]
|
|
|
+enum WebSearchResultItem {
|
|
|
+ SearchResult {
|
|
|
+ tool_use_id: String,
|
|
|
+ content: Vec<SearchHit>,
|
|
|
+ },
|
|
|
+ Commentary(String),
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Serialize)]
|
|
|
+struct SearchHit {
|
|
|
+ title: String,
|
|
|
+ url: String,
|
|
|
+}
|
|
|
+
|
|
|
+fn execute_web_fetch(input: &WebFetchInput) -> Result<WebFetchOutput, String> {
|
|
|
+ let started = Instant::now();
|
|
|
+ let client = build_http_client()?;
|
|
|
+ let request_url = normalize_fetch_url(&input.url)?;
|
|
|
+ let response = client
|
|
|
+ .get(request_url.clone())
|
|
|
+ .send()
|
|
|
+ .map_err(|error| error.to_string())?;
|
|
|
+
|
|
|
+ let status = response.status();
|
|
|
+ let final_url = response.url().to_string();
|
|
|
+ let code = status.as_u16();
|
|
|
+ let code_text = status.canonical_reason().unwrap_or("Unknown").to_string();
|
|
|
+ let content_type = response
|
|
|
+ .headers()
|
|
|
+ .get(reqwest::header::CONTENT_TYPE)
|
|
|
+ .and_then(|value| value.to_str().ok())
|
|
|
+ .unwrap_or_default()
|
|
|
+ .to_string();
|
|
|
+ let body = response.text().map_err(|error| error.to_string())?;
|
|
|
+ let bytes = body.len();
|
|
|
+ let normalized = normalize_fetched_content(&body, &content_type);
|
|
|
+ let result = summarize_web_fetch(&final_url, &input.prompt, &normalized);
|
|
|
+
|
|
|
+ Ok(WebFetchOutput {
|
|
|
+ bytes,
|
|
|
+ code,
|
|
|
+ code_text,
|
|
|
+ result,
|
|
|
+ duration_ms: started.elapsed().as_millis(),
|
|
|
+ url: final_url,
|
|
|
+ })
|
|
|
+}
|
|
|
+
|
|
|
+fn execute_web_search(input: &WebSearchInput) -> Result<WebSearchOutput, String> {
|
|
|
+ let started = Instant::now();
|
|
|
+ let client = build_http_client()?;
|
|
|
+ let search_url = build_search_url(&input.query)?;
|
|
|
+ let response = client
|
|
|
+ .get(search_url)
|
|
|
+ .send()
|
|
|
+ .map_err(|error| error.to_string())?;
|
|
|
+
|
|
|
+ let final_url = response.url().clone();
|
|
|
+ let html = response.text().map_err(|error| error.to_string())?;
|
|
|
+ let mut hits = extract_search_hits(&html);
|
|
|
+
|
|
|
+ if hits.is_empty() && final_url.host_str().is_some() {
|
|
|
+ hits = extract_search_hits_from_generic_links(&html);
|
|
|
+ }
|
|
|
+
|
|
|
+ if let Some(allowed) = input.allowed_domains.as_ref() {
|
|
|
+ hits.retain(|hit| host_matches_list(&hit.url, allowed));
|
|
|
+ }
|
|
|
+ if let Some(blocked) = input.blocked_domains.as_ref() {
|
|
|
+ hits.retain(|hit| !host_matches_list(&hit.url, blocked));
|
|
|
+ }
|
|
|
+
|
|
|
+ dedupe_hits(&mut hits);
|
|
|
+ hits.truncate(8);
|
|
|
+
|
|
|
+ let summary = if hits.is_empty() {
|
|
|
+ format!("No web search results matched the query {:?}.", input.query)
|
|
|
+ } else {
|
|
|
+ let rendered_hits = hits
|
|
|
+ .iter()
|
|
|
+ .map(|hit| format!("- [{}]({})", hit.title, hit.url))
|
|
|
+ .collect::<Vec<_>>()
|
|
|
+ .join("\n");
|
|
|
+ format!(
|
|
|
+ "Search results for {:?}. Include a Sources section in the final answer.\n{}",
|
|
|
+ input.query, rendered_hits
|
|
|
+ )
|
|
|
+ };
|
|
|
+
|
|
|
+ Ok(WebSearchOutput {
|
|
|
+ query: input.query.clone(),
|
|
|
+ results: vec![
|
|
|
+ WebSearchResultItem::Commentary(summary),
|
|
|
+ WebSearchResultItem::SearchResult {
|
|
|
+ tool_use_id: String::from("web_search_1"),
|
|
|
+ content: hits,
|
|
|
+ },
|
|
|
+ ],
|
|
|
+ duration_seconds: started.elapsed().as_secs_f64(),
|
|
|
+ })
|
|
|
+}
|
|
|
+
|
|
|
+fn build_http_client() -> Result<Client, String> {
|
|
|
+ Client::builder()
|
|
|
+ .timeout(Duration::from_secs(20))
|
|
|
+ .redirect(reqwest::redirect::Policy::limited(10))
|
|
|
+ .user_agent("clawd-rust-tools/0.1")
|
|
|
+ .build()
|
|
|
+ .map_err(|error| error.to_string())
|
|
|
+}
|
|
|
+
|
|
|
+fn normalize_fetch_url(url: &str) -> Result<String, String> {
|
|
|
+ let parsed = reqwest::Url::parse(url).map_err(|error| error.to_string())?;
|
|
|
+ if parsed.scheme() == "http" {
|
|
|
+ let host = parsed.host_str().unwrap_or_default();
|
|
|
+ if host != "localhost" && host != "127.0.0.1" && host != "::1" {
|
|
|
+ let mut upgraded = parsed;
|
|
|
+ upgraded
|
|
|
+ .set_scheme("https")
|
|
|
+ .map_err(|_| String::from("failed to upgrade URL to https"))?;
|
|
|
+ return Ok(upgraded.to_string());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(parsed.to_string())
|
|
|
+}
|
|
|
+
|
|
|
+fn build_search_url(query: &str) -> Result<reqwest::Url, String> {
|
|
|
+ if let Ok(base) = std::env::var("CLAWD_WEB_SEARCH_BASE_URL") {
|
|
|
+ let mut url = reqwest::Url::parse(&base).map_err(|error| error.to_string())?;
|
|
|
+ url.query_pairs_mut().append_pair("q", query);
|
|
|
+ return Ok(url);
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut url = reqwest::Url::parse("https://html.duckduckgo.com/html/")
|
|
|
+ .map_err(|error| error.to_string())?;
|
|
|
+ url.query_pairs_mut().append_pair("q", query);
|
|
|
+ Ok(url)
|
|
|
+}
|
|
|
+
|
|
|
+fn normalize_fetched_content(body: &str, content_type: &str) -> String {
|
|
|
+ if content_type.contains("html") {
|
|
|
+ html_to_text(body)
|
|
|
+ } else {
|
|
|
+ body.trim().to_string()
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+fn summarize_web_fetch(url: &str, prompt: &str, content: &str) -> String {
|
|
|
+ let lower_prompt = prompt.to_lowercase();
|
|
|
+ let compact = collapse_whitespace(content);
|
|
|
+
|
|
|
+ let detail = if lower_prompt.contains("title") {
|
|
|
+ extract_title(content)
|
|
|
+ .map(|title| format!("Title: {title}"))
|
|
|
+ .unwrap_or_else(|| preview_text(&compact, 600))
|
|
|
+ } else if lower_prompt.contains("summary") || lower_prompt.contains("summarize") {
|
|
|
+ preview_text(&compact, 900)
|
|
|
+ } else {
|
|
|
+ let preview = preview_text(&compact, 900);
|
|
|
+ format!("Prompt: {prompt}\nContent preview:\n{preview}")
|
|
|
+ };
|
|
|
+
|
|
|
+ format!("Fetched {url}\n{detail}")
|
|
|
+}
|
|
|
+
|
|
|
+fn extract_title(content: &str) -> Option<String> {
|
|
|
+ for line in content.lines() {
|
|
|
+ let trimmed = line.trim();
|
|
|
+ if !trimmed.is_empty() {
|
|
|
+ return Some(trimmed.to_string());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ None
|
|
|
+}
|
|
|
+
|
|
|
+fn html_to_text(html: &str) -> String {
|
|
|
+ let mut text = String::with_capacity(html.len());
|
|
|
+ let mut in_tag = false;
|
|
|
+ let mut previous_was_space = false;
|
|
|
+
|
|
|
+ for ch in html.chars() {
|
|
|
+ match ch {
|
|
|
+ '<' => in_tag = true,
|
|
|
+ '>' => in_tag = false,
|
|
|
+ _ if in_tag => {}
|
|
|
+ '&' => {
|
|
|
+ text.push('&');
|
|
|
+ previous_was_space = false;
|
|
|
+ }
|
|
|
+ ch if ch.is_whitespace() => {
|
|
|
+ if !previous_was_space {
|
|
|
+ text.push(' ');
|
|
|
+ previous_was_space = true;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ _ => {
|
|
|
+ text.push(ch);
|
|
|
+ previous_was_space = false;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ collapse_whitespace(&decode_html_entities(&text))
|
|
|
+}
|
|
|
+
|
|
|
+fn decode_html_entities(input: &str) -> String {
|
|
|
+ input
|
|
|
+ .replace("&", "&")
|
|
|
+ .replace("<", "<")
|
|
|
+ .replace(">", ">")
|
|
|
+ .replace(""", "\"")
|
|
|
+ .replace("'", "'")
|
|
|
+ .replace(" ", " ")
|
|
|
+}
|
|
|
+
|
|
|
+fn collapse_whitespace(input: &str) -> String {
|
|
|
+ input.split_whitespace().collect::<Vec<_>>().join(" ")
|
|
|
+}
|
|
|
+
|
|
|
+fn preview_text(input: &str, max_chars: usize) -> String {
|
|
|
+ if input.chars().count() <= max_chars {
|
|
|
+ return input.to_string();
|
|
|
+ }
|
|
|
+ let shortened = input.chars().take(max_chars).collect::<String>();
|
|
|
+ format!("{}…", shortened.trim_end())
|
|
|
+}
|
|
|
+
|
|
|
+fn extract_search_hits(html: &str) -> Vec<SearchHit> {
|
|
|
+ let mut hits = Vec::new();
|
|
|
+ let mut remaining = html;
|
|
|
+
|
|
|
+ while let Some(anchor_start) = remaining.find("result__a") {
|
|
|
+ let after_class = &remaining[anchor_start..];
|
|
|
+ let Some(href_idx) = after_class.find("href=") else {
|
|
|
+ remaining = &after_class[1..];
|
|
|
+ continue;
|
|
|
+ };
|
|
|
+ let href_slice = &after_class[href_idx + 5..];
|
|
|
+ let Some((url, rest)) = extract_quoted_value(href_slice) else {
|
|
|
+ remaining = &after_class[1..];
|
|
|
+ continue;
|
|
|
+ };
|
|
|
+ let Some(close_tag_idx) = rest.find('>') else {
|
|
|
+ remaining = &after_class[1..];
|
|
|
+ continue;
|
|
|
+ };
|
|
|
+ let after_tag = &rest[close_tag_idx + 1..];
|
|
|
+ let Some(end_anchor_idx) = after_tag.find("</a>") else {
|
|
|
+ remaining = &after_tag[1..];
|
|
|
+ continue;
|
|
|
+ };
|
|
|
+ let title = html_to_text(&after_tag[..end_anchor_idx]);
|
|
|
+ if let Some(decoded_url) = decode_duckduckgo_redirect(&url) {
|
|
|
+ hits.push(SearchHit {
|
|
|
+ title: title.trim().to_string(),
|
|
|
+ url: decoded_url,
|
|
|
+ });
|
|
|
+ }
|
|
|
+ remaining = &after_tag[end_anchor_idx + 4..];
|
|
|
+ }
|
|
|
+
|
|
|
+ hits
|
|
|
+}
|
|
|
+
|
|
|
+fn extract_search_hits_from_generic_links(html: &str) -> Vec<SearchHit> {
|
|
|
+ let mut hits = Vec::new();
|
|
|
+ let mut remaining = html;
|
|
|
+
|
|
|
+ while let Some(anchor_start) = remaining.find("<a") {
|
|
|
+ let after_anchor = &remaining[anchor_start..];
|
|
|
+ let Some(href_idx) = after_anchor.find("href=") else {
|
|
|
+ remaining = &after_anchor[2..];
|
|
|
+ continue;
|
|
|
+ };
|
|
|
+ let href_slice = &after_anchor[href_idx + 5..];
|
|
|
+ let Some((url, rest)) = extract_quoted_value(href_slice) else {
|
|
|
+ remaining = &after_anchor[2..];
|
|
|
+ continue;
|
|
|
+ };
|
|
|
+ let Some(close_tag_idx) = rest.find('>') else {
|
|
|
+ remaining = &after_anchor[2..];
|
|
|
+ continue;
|
|
|
+ };
|
|
|
+ let after_tag = &rest[close_tag_idx + 1..];
|
|
|
+ let Some(end_anchor_idx) = after_tag.find("</a>") else {
|
|
|
+ remaining = &after_anchor[2..];
|
|
|
+ continue;
|
|
|
+ };
|
|
|
+ let title = html_to_text(&after_tag[..end_anchor_idx]);
|
|
|
+ if title.trim().is_empty() {
|
|
|
+ remaining = &after_tag[end_anchor_idx + 4..];
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ let decoded_url = decode_duckduckgo_redirect(&url).unwrap_or(url);
|
|
|
+ if decoded_url.starts_with("http://") || decoded_url.starts_with("https://") {
|
|
|
+ hits.push(SearchHit {
|
|
|
+ title: title.trim().to_string(),
|
|
|
+ url: decoded_url,
|
|
|
+ });
|
|
|
+ }
|
|
|
+ remaining = &after_tag[end_anchor_idx + 4..];
|
|
|
+ }
|
|
|
+
|
|
|
+ hits
|
|
|
+}
|
|
|
+
|
|
|
+fn extract_quoted_value(input: &str) -> Option<(String, &str)> {
|
|
|
+ let quote = input.chars().next()?;
|
|
|
+ if quote != '"' && quote != '\'' {
|
|
|
+ return None;
|
|
|
+ }
|
|
|
+ let rest = &input[quote.len_utf8()..];
|
|
|
+ let end = rest.find(quote)?;
|
|
|
+ Some((rest[..end].to_string(), &rest[end + quote.len_utf8()..]))
|
|
|
+}
|
|
|
+
|
|
|
+fn decode_duckduckgo_redirect(url: &str) -> Option<String> {
|
|
|
+ if url.starts_with("http://") || url.starts_with("https://") {
|
|
|
+ return Some(html_entity_decode_url(url));
|
|
|
+ }
|
|
|
+
|
|
|
+ let joined = if url.starts_with("//") {
|
|
|
+ format!("https:{url}")
|
|
|
+ } else if url.starts_with('/') {
|
|
|
+ format!("https://duckduckgo.com{url}")
|
|
|
+ } else {
|
|
|
+ return None;
|
|
|
+ };
|
|
|
+
|
|
|
+ let parsed = reqwest::Url::parse(&joined).ok()?;
|
|
|
+ if parsed.path() == "/l/" || parsed.path() == "/l" {
|
|
|
+ for (key, value) in parsed.query_pairs() {
|
|
|
+ if key == "uddg" {
|
|
|
+ return Some(html_entity_decode_url(value.as_ref()));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Some(joined)
|
|
|
+}
|
|
|
+
|
|
|
+fn html_entity_decode_url(url: &str) -> String {
|
|
|
+ decode_html_entities(url)
|
|
|
+}
|
|
|
+
|
|
|
+fn host_matches_list(url: &str, domains: &[String]) -> bool {
|
|
|
+ let Ok(parsed) = reqwest::Url::parse(url) else {
|
|
|
+ return false;
|
|
|
+ };
|
|
|
+ let Some(host) = parsed.host_str() else {
|
|
|
+ return false;
|
|
|
+ };
|
|
|
+ domains.iter().any(|domain| {
|
|
|
+ let normalized = domain.trim().trim_start_matches('.');
|
|
|
+ host == normalized || host.ends_with(&format!(".{normalized}"))
|
|
|
+ })
|
|
|
+}
|
|
|
+
|
|
|
+fn dedupe_hits(hits: &mut Vec<SearchHit>) {
|
|
|
+ let mut seen = BTreeSet::new();
|
|
|
+ hits.retain(|hit| seen.insert(hit.url.clone()));
|
|
|
+}
|
|
|
+
|
|
|
#[cfg(test)]
|
|
|
mod tests {
|
|
|
+ use std::io::{Read, Write};
|
|
|
+ use std::net::{SocketAddr, TcpListener};
|
|
|
+ use std::sync::Arc;
|
|
|
+ use std::thread;
|
|
|
+ use std::time::Duration;
|
|
|
+
|
|
|
use super::{execute_tool, mvp_tool_specs};
|
|
|
use serde_json::json;
|
|
|
|
|
|
@@ -240,6 +691,8 @@ mod tests {
|
|
|
.collect::<Vec<_>>();
|
|
|
assert!(names.contains(&"bash"));
|
|
|
assert!(names.contains(&"read_file"));
|
|
|
+ assert!(names.contains(&"WebFetch"));
|
|
|
+ assert!(names.contains(&"WebSearch"));
|
|
|
}
|
|
|
|
|
|
#[test]
|
|
|
@@ -247,4 +700,167 @@ mod tests {
|
|
|
let error = execute_tool("nope", &json!({})).expect_err("tool should be rejected");
|
|
|
assert!(error.contains("unsupported tool"));
|
|
|
}
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn web_fetch_returns_prompt_aware_summary() {
|
|
|
+ let server = TestServer::spawn(Arc::new(|request_line: &str| {
|
|
|
+ assert!(request_line.starts_with("GET /page "));
|
|
|
+ HttpResponse::html(
|
|
|
+ 200,
|
|
|
+ "OK",
|
|
|
+ "<html><head><title>Ignored</title></head><body><h1>Test Page</h1><p>Hello <b>world</b> from local server.</p></body></html>",
|
|
|
+ )
|
|
|
+ }));
|
|
|
+
|
|
|
+ let result = execute_tool(
|
|
|
+ "WebFetch",
|
|
|
+ &json!({
|
|
|
+ "url": format!("http://{}/page", server.addr()),
|
|
|
+ "prompt": "Summarize this page"
|
|
|
+ }),
|
|
|
+ )
|
|
|
+ .expect("WebFetch should succeed");
|
|
|
+
|
|
|
+ let output: serde_json::Value = serde_json::from_str(&result).expect("valid json");
|
|
|
+ assert_eq!(output["code"], 200);
|
|
|
+ let summary = output["result"].as_str().expect("result string");
|
|
|
+ assert!(summary.contains("Fetched"));
|
|
|
+ assert!(summary.contains("Test Page"));
|
|
|
+ assert!(summary.contains("Hello world from local server"));
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn web_search_extracts_and_filters_results() {
|
|
|
+ let server = TestServer::spawn(Arc::new(|request_line: &str| {
|
|
|
+ assert!(request_line.contains("GET /search?q=rust+web+search "));
|
|
|
+ HttpResponse::html(
|
|
|
+ 200,
|
|
|
+ "OK",
|
|
|
+ r#"
|
|
|
+ <html><body>
|
|
|
+ <a class="result__a" href="https://docs.rs/reqwest">Reqwest docs</a>
|
|
|
+ <a class="result__a" href="https://example.com/blocked">Blocked result</a>
|
|
|
+ </body></html>
|
|
|
+ "#,
|
|
|
+ )
|
|
|
+ }));
|
|
|
+
|
|
|
+ std::env::set_var(
|
|
|
+ "CLAWD_WEB_SEARCH_BASE_URL",
|
|
|
+ format!("http://{}/search", server.addr()),
|
|
|
+ );
|
|
|
+ let result = execute_tool(
|
|
|
+ "WebSearch",
|
|
|
+ &json!({
|
|
|
+ "query": "rust web search",
|
|
|
+ "allowed_domains": ["docs.rs"],
|
|
|
+ "blocked_domains": ["example.com"]
|
|
|
+ }),
|
|
|
+ )
|
|
|
+ .expect("WebSearch should succeed");
|
|
|
+ std::env::remove_var("CLAWD_WEB_SEARCH_BASE_URL");
|
|
|
+
|
|
|
+ let output: serde_json::Value = serde_json::from_str(&result).expect("valid json");
|
|
|
+ assert_eq!(output["query"], "rust web search");
|
|
|
+ let results = output["results"].as_array().expect("results array");
|
|
|
+ let search_result = results
|
|
|
+ .iter()
|
|
|
+ .find(|item| item.get("content").is_some())
|
|
|
+ .expect("search result block present");
|
|
|
+ let content = search_result["content"].as_array().expect("content array");
|
|
|
+ assert_eq!(content.len(), 1);
|
|
|
+ assert_eq!(content[0]["title"], "Reqwest docs");
|
|
|
+ assert_eq!(content[0]["url"], "https://docs.rs/reqwest");
|
|
|
+ }
|
|
|
+
|
|
|
+ struct TestServer {
|
|
|
+ addr: SocketAddr,
|
|
|
+ shutdown: Option<std::sync::mpsc::Sender<()>>,
|
|
|
+ handle: Option<thread::JoinHandle<()>>,
|
|
|
+ }
|
|
|
+
|
|
|
+ impl TestServer {
|
|
|
+ fn spawn(handler: Arc<dyn Fn(&str) -> HttpResponse + Send + Sync + 'static>) -> Self {
|
|
|
+ let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
|
|
|
+ listener
|
|
|
+ .set_nonblocking(true)
|
|
|
+ .expect("set nonblocking listener");
|
|
|
+ let addr = listener.local_addr().expect("local addr");
|
|
|
+ let (tx, rx) = std::sync::mpsc::channel::<()>();
|
|
|
+
|
|
|
+ let handle = thread::spawn(move || loop {
|
|
|
+ if rx.try_recv().is_ok() {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ match listener.accept() {
|
|
|
+ Ok((mut stream, _)) => {
|
|
|
+ let mut buffer = [0_u8; 4096];
|
|
|
+ let size = stream.read(&mut buffer).expect("read request");
|
|
|
+ let request = String::from_utf8_lossy(&buffer[..size]).into_owned();
|
|
|
+ let request_line = request.lines().next().unwrap_or_default().to_string();
|
|
|
+ let response = handler(&request_line);
|
|
|
+ stream
|
|
|
+ .write_all(response.to_bytes().as_slice())
|
|
|
+ .expect("write response");
|
|
|
+ }
|
|
|
+ Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
|
|
|
+ thread::sleep(Duration::from_millis(10));
|
|
|
+ }
|
|
|
+ Err(error) => panic!("server accept failed: {error}"),
|
|
|
+ }
|
|
|
+ });
|
|
|
+
|
|
|
+ Self {
|
|
|
+ addr,
|
|
|
+ shutdown: Some(tx),
|
|
|
+ handle: Some(handle),
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ fn addr(&self) -> SocketAddr {
|
|
|
+ self.addr
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ impl Drop for TestServer {
|
|
|
+ fn drop(&mut self) {
|
|
|
+ if let Some(tx) = self.shutdown.take() {
|
|
|
+ let _ = tx.send(());
|
|
|
+ }
|
|
|
+ if let Some(handle) = self.handle.take() {
|
|
|
+ handle.join().expect("join test server");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ struct HttpResponse {
|
|
|
+ status: u16,
|
|
|
+ reason: &'static str,
|
|
|
+ content_type: &'static str,
|
|
|
+ body: String,
|
|
|
+ }
|
|
|
+
|
|
|
+ impl HttpResponse {
|
|
|
+ fn html(status: u16, reason: &'static str, body: &str) -> Self {
|
|
|
+ Self {
|
|
|
+ status,
|
|
|
+ reason,
|
|
|
+ content_type: "text/html; charset=utf-8",
|
|
|
+ body: body.to_string(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ fn to_bytes(&self) -> Vec<u8> {
|
|
|
+ format!(
|
|
|
+ "HTTP/1.1 {} {}\r\nContent-Type: {}\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
|
|
|
+ self.status,
|
|
|
+ self.reason,
|
|
|
+ self.content_type,
|
|
|
+ self.body.len(),
|
|
|
+ self.body
|
|
|
+ )
|
|
|
+ .into_bytes()
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|