Skip to main content

tuwunel_service/media/
preview.rs

1//! URL Previews
2//!
3//! This functionality is gated by 'url_preview', but not at the unit level for
4//! historical and simplicity reasons. Instead the feature gates the inclusion
5//! of dependencies and nulls out results through the existing interface when
6//! not featured.
7
8use std::{net::IpAddr, time::SystemTime};
9
10use ipaddress::IPAddress;
11use serde::Serialize;
12use tuwunel_core::{Err, Result, debug, err, implement};
13use url::{Host, Url};
14
15use super::Service;
16
/// Metadata describing a URL preview.
///
/// Every field is optional and is left out of the serialized output when it
/// is `None`. Fields are serialized under the `og:*` / `matrix:*` keys given
/// in each `rename` attribute rather than under their Rust names.
#[derive(Default, Serialize)]
pub struct UrlPreviewData {
	/// Page title; serialized as `og:title`.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "og:title")
	)]
	pub title: Option<String>,
	/// Page description; serialized as `og:description`.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "og:description")
	)]
	pub description: Option<String>,
	/// Preview image reference; serialized as `og:image`. `download_image`
	/// fills this with the string form of the MXC it stored the image under.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "og:image")
	)]
	pub image: Option<String>,
	/// Length in bytes of the stored image; serialized as
	/// `matrix:image:size`.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "matrix:image:size")
	)]
	pub image_size: Option<usize>,
	/// Image width, when it could be determined; serialized as
	/// `og:image:width`.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "og:image:width")
	)]
	pub image_width: Option<u32>,
	/// Image height, when it could be determined; serialized as
	/// `og:image:height`.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "og:image:height")
	)]
	pub image_height: Option<u32>,
	/// Serialized as `og:video`.
	// NOTE(review): none of the video/audio fields are populated by the code
	// visible in this file -- confirm whether another module sets them.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "og:video")
	)]
	pub video: Option<String>,
	/// Serialized as `matrix:video:size`.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "matrix:video:size")
	)]
	pub video_size: Option<usize>,
	/// Serialized as `og:video:width`.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "og:video:width")
	)]
	pub video_width: Option<u32>,
	/// Serialized as `og:video:height`.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "og:video:height")
	)]
	pub video_height: Option<u32>,
	/// Serialized as `og:audio`.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "og:audio")
	)]
	pub audio: Option<String>,
	/// Serialized as `matrix:audio:size`.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "matrix:audio:size")
	)]
	pub audio_size: Option<usize>,
	/// OpenGraph object type reported by the page; serialized as `og:type`.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "og:type")
	)]
	pub og_type: Option<String>,
	/// Value of the page's OpenGraph `url` property; serialized as `og:url`.
	#[serde(
		skip_serializing_if = "Option::is_none",
		rename(serialize = "og:url")
	)]
	pub og_url: Option<String>,
}
90
/// Removes the stored preview entry for `url` by delegating to the database
/// layer.
///
/// Only the preview record is removed; an image downloaded for the preview
/// is left in place (see the TODO in the body).
#[implement(Service)]
pub fn remove_url_preview(&self, url: &str) -> Result {
	// TODO: also remove the downloaded image
	self.db.remove_url_preview(url)
}
96
97#[implement(Service)]
98pub fn set_url_preview(&self, url: &str, data: &UrlPreviewData) -> Result {
99	let now = SystemTime::now()
100		.duration_since(SystemTime::UNIX_EPOCH)
101		.expect("valid system time");
102	self.db.set_url_preview(url, data, now)
103}
104
105#[implement(Service)]
106pub async fn get_url_preview(&self, url: &Url) -> Result<UrlPreviewData> {
107	if let Ok(preview) = self.db.get_url_preview(url.as_str()).await {
108		return Ok(preview);
109	}
110
111	// ensure that only one request is made per URL
112	let _request_lock = self.url_preview_mutex.lock(url.as_str()).await;
113
114	match self.db.get_url_preview(url.as_str()).await {
115		| Ok(preview) => Ok(preview),
116		| Err(_) => self.request_url_preview(url).await,
117	}
118}
119
120#[implement(Service)]
121pub async fn request_url_preview(&self, url: &Url) -> Result<UrlPreviewData> {
122	self.check_url_host(url)?;
123
124	let client = &self.services.client.url_preview;
125	let response = client.get(url.as_str()).send().await?;
126
127	debug!(?url, "URL preview response headers: {:?}", response.headers());
128
129	if let Some(remote_addr) = response.remote_addr() {
130		debug!(?url, "URL preview response remote address: {:?}", remote_addr);
131
132		if let Ok(ip) = IPAddress::parse(remote_addr.ip().to_string())
133			&& !self.services.client.valid_cidr_range(&ip)
134		{
135			return Err!(Request(Forbidden("Requesting from this address is forbidden")));
136		}
137	}
138
139	let content_type = response
140		.headers()
141		.get(reqwest::header::CONTENT_TYPE)
142		.ok_or_else(|| err!(Request(Unknown("Missing Content-Type header"))))?
143		.to_str()
144		.map_err(|e| err!(Request(Unknown("Invalid Content-Type header: {e}"))))?
145		.to_owned();
146
147	let data = match content_type.as_str() {
148		| html if html.starts_with("text/html") => self.download_html(url, response).await?,
149		| img if img.starts_with("image/") => self.download_image(response).await?,
150		| _ => return Err!(Request(Unknown("Unsupported Content-Type"))),
151	};
152
153	self.set_url_preview(url.as_str(), &data)?;
154
155	Ok(data)
156}
157
/// Reads an image response, stores it as local media and returns preview
/// data pointing at the stored copy.
///
/// The body is read through `read_response_capped` with the configured
/// `max_response_size` as its limit, then stored under a new MXC on this
/// server with a random media id. The dimensions are probed afterwards; when
/// the format cannot be guessed or the dimensions cannot be read, width and
/// height are left as `None` and the call still succeeds.
///
/// # Errors
///
/// Returns an error when reading the body or storing the media fails.
#[cfg(feature = "url_preview")]
#[implement(Service)]
pub async fn download_image(&self, response: reqwest::Response) -> Result<UrlPreviewData> {
	use image::ImageReader;
	use ruma::Mxc;
	use tuwunel_core::utils::random_string;

	let image = crate::client::read_response_capped(
		response,
		self.services.config.max_response_size,
	)
	.await?;

	let media_id = random_string(super::MXC_LENGTH);
	let mxc = Mxc {
		server_name: self.services.globals.server_name(),
		media_id: &media_id,
	};

	self.create(&mxc, None, None, None, &image)
		.await?;

	// Best effort: any failure while probing simply yields no dimensions.
	let dimensions = ImageReader::new(std::io::Cursor::new(&image))
		.with_guessed_format()
		.ok()
		.and_then(|reader| reader.into_dimensions().ok());

	Ok(UrlPreviewData {
		image: Some(mxc.to_string()),
		image_size: Some(image.len()),
		image_width: dimensions.map(|(width, _)| width),
		image_height: dimensions.map(|(_, height)| height),
		..Default::default()
	})
}
192
/// Stand-in for `download_image` when the `url_preview` feature is not
/// compiled in; always returns a `FeatureDisabled` error.
///
/// The function stays `async` so its signature matches the feature-enabled
/// variant, hence the `unused_async` expectation.
#[cfg(not(feature = "url_preview"))]
#[implement(Service)]
#[expect(clippy::unused_async)]
pub async fn download_image(&self, _response: reqwest::Response) -> Result<UrlPreviewData> {
	Err!(FeatureDisabled("url_preview"))
}
199
200#[cfg(feature = "url_preview")]
201#[implement(Service)]
202async fn download_html(
203	&self,
204	url: &Url,
205	mut response: reqwest::Response,
206) -> Result<UrlPreviewData> {
207	use webpage::HTML;
208
209	let mut bytes: Vec<u8> = Vec::new();
210	while let Some(chunk) = response.chunk().await? {
211		bytes.extend_from_slice(&chunk);
212		if bytes.len() > self.services.config.url_preview_max_spider_size {
213			debug!(
214				"Response body from URL {} exceeds url_preview_max_spider_size ({}), not \
215				 processing the rest of the response body and assuming our necessary data is in \
216				 this range.",
217				url, self.services.config.url_preview_max_spider_size
218			);
219			break;
220		}
221	}
222	let body = String::from_utf8_lossy(&bytes);
223	let Ok(html) = HTML::from_string(body.to_string(), Some(url.to_string())) else {
224		return Err!(Request(Unknown("Failed to parse HTML")));
225	};
226
227	// `webpage` does not resolve relative URLs in `og:` meta tags; resolve
228	// against the page URL so e.g. `og:image=test.png` becomes absolute.
229	let client = &self.services.client.url_preview;
230	let mut data = match html.opengraph.images.first() {
231		| None => UrlPreviewData::default(),
232		| Some(obj) => {
233			let image_url = url
234				.join(&obj.url)
235				.map_err(|e| err!(Request(Unknown("Invalid og:image URL: {e}"))))?;
236
237			self.check_url_host(&image_url)?;
238			let image_response = client.get(image_url.as_str()).send().await?;
239
240			if let Some(remote_addr) = image_response.remote_addr() {
241				debug!(?image_url, ?remote_addr, "og:image remote address");
242
243				if let Ok(ip) = IPAddress::parse(remote_addr.ip().to_string())
244					&& !self.services.client.valid_cidr_range(&ip)
245				{
246					return Err!(Request(Forbidden("Requesting from this address is forbidden")));
247				}
248			}
249
250			self.download_image(image_response).await?
251		},
252	};
253
254	let props = html.opengraph.properties;
255
256	/* use OpenGraph title/description, but fall back to HTML if not available */
257	data.title = props.get("title").cloned().or(html.title);
258	data.description = props
259		.get("description")
260		.cloned()
261		.or(html.description);
262	data.og_type = Some(html.opengraph.og_type);
263	data.og_url = props.get("url").cloned();
264
265	Ok(data)
266}
267
/// Stand-in for `download_html` when the `url_preview` feature is not
/// compiled in; always returns a `FeatureDisabled` error.
///
/// The function stays `async` so its signature matches the feature-enabled
/// variant, hence the `unused_async` expectation.
#[cfg(not(feature = "url_preview"))]
#[implement(Service)]
#[expect(clippy::unused_async)]
async fn download_html(
	&self,
	_url: &Url,
	_response: reqwest::Response,
) -> Result<UrlPreviewData> {
	Err!(FeatureDisabled("url_preview"))
}
278
279#[implement(Service)]
280pub(super) fn check_url_host(&self, url: &Url) -> Result {
281	let host = url
282		.host()
283		.ok_or_else(|| err!(Request(Unknown("URL has no host"))))?;
284
285	let ip = match host {
286		| Host::Domain(_) => return Ok(()),
287		| Host::Ipv4(v4) => IpAddr::V4(v4),
288		| Host::Ipv6(v6) => IpAddr::V6(v6),
289	};
290
291	if !self.services.client.valid_cidr_range_ip(ip) {
292		return Err!(Request(Forbidden("Requesting from this address is forbidden")));
293	}
294
295	Ok(())
296}
297
298#[implement(Service)]
299pub fn url_preview_allowed(&self, url: &Url) -> bool {
300	if ["http", "https"]
301		.iter()
302		.all(|&scheme| !scheme.eq_ignore_ascii_case(url.scheme()))
303	{
304		debug!("Ignoring non-HTTP/HTTPS URL to preview: {}", url);
305		return false;
306	}
307
308	let host = match url.host_str() {
309		| None => {
310			debug!("Ignoring URL preview for a URL that does not have a host (?): {}", url);
311			return false;
312		},
313		| Some(h) => h.to_owned(),
314	};
315
316	let allowlist_domain_contains = &self
317		.services
318		.config
319		.url_preview_domain_contains_allowlist;
320	let allowlist_domain_explicit = &self
321		.services
322		.config
323		.url_preview_domain_explicit_allowlist;
324	let denylist_domain_explicit = &self
325		.services
326		.config
327		.url_preview_domain_explicit_denylist;
328	let allowlist_url_contains = &self
329		.services
330		.config
331		.url_preview_url_contains_allowlist;
332
333	if allowlist_domain_contains.contains(&"*".to_owned())
334		|| allowlist_domain_explicit.contains(&"*".to_owned())
335		|| allowlist_url_contains.contains(&"*".to_owned())
336	{
337		debug!("Config key contains * which is allowing all URL previews. Allowing URL {}", url);
338		return true;
339	}
340
341	if !host.is_empty() {
342		if denylist_domain_explicit.contains(&host) {
343			debug!(
344				"Host {} is not allowed by url_preview_domain_explicit_denylist (check 1/4)",
345				&host
346			);
347			return false;
348		}
349
350		if allowlist_domain_explicit.contains(&host) {
351			debug!(
352				"Host {} is allowed by url_preview_domain_explicit_allowlist (check 2/4)",
353				&host
354			);
355			return true;
356		}
357
358		if allowlist_domain_contains
359			.iter()
360			.any(|domain_s| domain_s.contains(&host.clone()))
361		{
362			debug!(
363				"Host {} is allowed by url_preview_domain_contains_allowlist (check 3/4)",
364				&host
365			);
366			return true;
367		}
368
369		if allowlist_url_contains
370			.iter()
371			.any(|url_s| url.to_string().contains(url_s))
372		{
373			debug!("URL {} is allowed by url_preview_url_contains_allowlist (check 4/4)", &host);
374			return true;
375		}
376
377		// check root domain if available and if user has root domain checks
378		if self.services.config.url_preview_check_root_domain {
379			debug!("Checking root domain");
380			match host.split_once('.') {
381				| None => return false,
382				| Some((_, root_domain)) => {
383					if denylist_domain_explicit.contains(&root_domain.to_owned()) {
384						debug!(
385							"Root domain {} is not allowed by \
386							 url_preview_domain_explicit_denylist (check 1/3)",
387							&root_domain
388						);
389						return false;
390					}
391
392					if allowlist_domain_explicit.contains(&root_domain.to_owned()) {
393						debug!(
394							"Root domain {} is allowed by url_preview_domain_explicit_allowlist \
395							 (check 2/3)",
396							&root_domain
397						);
398						return true;
399					}
400
401					if allowlist_domain_contains
402						.iter()
403						.any(|domain_s| domain_s.contains(&root_domain.to_owned()))
404					{
405						debug!(
406							"Root domain {} is allowed by url_preview_domain_contains_allowlist \
407							 (check 3/3)",
408							&root_domain
409						);
410						return true;
411					}
412				},
413			}
414		}
415	}
416
417	false
418}