From bff49cae38d4f7433fe749f084b4e0f49767ea90 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 12 Jun 2025 11:21:28 +0200 Subject: [PATCH] Add a binary to generate the embedder settings as a GitHub document --- crates/milli/src/bin/embedder_settings.rs | 423 ++++++++++++++++++++++ 1 file changed, 423 insertions(+) create mode 100644 crates/milli/src/bin/embedder_settings.rs diff --git a/crates/milli/src/bin/embedder_settings.rs b/crates/milli/src/bin/embedder_settings.rs new file mode 100644 index 000000000..fafbd2437 --- /dev/null +++ b/crates/milli/src/bin/embedder_settings.rs @@ -0,0 +1,423 @@ +use std::io::Write; + +use milli::vector::settings::{ + EmbedderSource, EmbeddingSettings, FieldStatus, MetaEmbeddingSetting, NestingContext, + ReindexOutcome, +}; + +pub trait Formatter { + fn begin_document(&mut self); + fn end_document(&mut self); + + fn begin_header(&mut self); + fn put_source_header(&mut self, source: EmbedderSource); + fn end_header(&mut self); + + fn begin_setting( + &mut self, + setting: MetaEmbeddingSetting, + description: &'static str, + kind: &'static str, + reindex_outcome: ReindexOutcome, + default_value: &'static str, + ); + fn end_setting(&mut self, setting: MetaEmbeddingSetting); + + fn put_setting_status( + &mut self, + source: EmbedderSource, + field_status_by_nesting_context: FieldStatusByNestingContext, + ); +} + +pub struct GitHubMdFormatter { + w: W, +} + +impl GitHubMdFormatter { + pub fn new(w: W) -> Self { + Self { w } + } +} + +impl Formatter for GitHubMdFormatter { + fn begin_document(&mut self) { + let s = r#" + + + + "#; + write!(self.w, "{s}").unwrap() + } + + fn end_document(&mut self) { + write!( + self.w, + r#" + +
+ "# + ) + .unwrap() + } + + fn begin_header(&mut self) { + write!( + self.w, + r#" + + +Setting +Description +Type +Default Value +Regenerate on Change +Availability for source + + + + "# + ) + .unwrap() + } + fn put_source_header(&mut self, source: EmbedderSource) { + write!( + self.w, + r#" + + +{source} + + + "# + ) + .unwrap() + } + fn end_header(&mut self) { + write!( + self.w, + r#" + + + "# + ) + .unwrap() + } + + fn begin_setting( + &mut self, + setting: MetaEmbeddingSetting, + description: &'static str, + kind: &'static str, + reindex_outcome: ReindexOutcome, + default_value: &'static str, + ) { + let name = setting.name(); + let reindex_outcome = match reindex_outcome { + ReindexOutcome::AlwaysReindex => "🏗️ Always", + ReindexOutcome::NeverReindex => "🌱 Never", + ReindexOutcome::ReindexSometimes(sometimes) => sometimes, + }; + write!( + self.w, + r#" + + + +`{name}` + + + + +{description} + + + + +{kind} + + + + +{default_value} + + + + +{reindex_outcome} + + + "# + ) + .unwrap() + } + + fn end_setting(&mut self, _setting: MetaEmbeddingSetting) { + write!( + self.w, + r#" + + + + "# + ) + .unwrap() + } + + fn put_setting_status( + &mut self, + _source: EmbedderSource, + field_status_by_nesting_context: FieldStatusByNestingContext, + ) { + let field_status = match field_status_by_nesting_context { + FieldStatusByNestingContext::Invariant(field_status) => { + format_field_status(field_status).to_string() + } + FieldStatusByNestingContext::Variant(variant_field_status_by_nesting_context) => { + format!( + r#" +- Usually, {} +- When used in `searchEmbedder` in a `composite` embedder, {} +- When used in `indexingEmbedder` in a `composite` embedder, {} + "#, + format_field_status(variant_field_status_by_nesting_context.not_nested), + format_field_status(variant_field_status_by_nesting_context.search), + format_field_status(variant_field_status_by_nesting_context.index) + ) + } + }; + write!( + self.w, + r#" + + +{field_status} + + + "# + ) + .unwrap(); + } +} + +fn format_field_status(field_status: FieldStatus) -> &'static str { + match field_status { + FieldStatus::Mandatory => "🔐 **Mandatory**", + FieldStatus::Allowed => "✅ Allowed", + FieldStatus::Disallowed => "🚫 Disallowed", + } +} + +pub struct GitHubMdAvailabilityFormatter(pub GitHubMdFormatter); +impl Formatter for GitHubMdAvailabilityFormatter { + fn begin_document(&mut self) { + write!(self.0.w, "## Availability of the settings depending on the selected source\n\n") + .unwrap(); + self.0.begin_document(); + } + + fn end_document(&mut self) { + self.0.end_document(); + } + + fn begin_header(&mut self) { + write!( + self.0.w, + r#" + + +Setting + "# + ) + .unwrap() + } + + fn put_source_header(&mut self, source: EmbedderSource) { + self.0.put_source_header(source); + } + + fn end_header(&mut self) { + self.0.end_header(); + } + + fn begin_setting( + &mut self, + setting: MetaEmbeddingSetting, + _description: &'static str, + _kind: &'static str, + _reindex_outcome: ReindexOutcome, + _default_value: &'static str, + ) { + if setting == MetaEmbeddingSetting::Source { + return; + } + let name = setting.name(); + write!( + self.0.w, + r#" + + + +`{name}` + + + "# + ) + .unwrap() + } + + fn end_setting(&mut self, setting: MetaEmbeddingSetting) { + if setting == MetaEmbeddingSetting::Source { + return; + } + self.0.end_setting(setting); + } + + fn put_setting_status( + &mut self, + source: EmbedderSource, + field_status_by_nesting_context: FieldStatusByNestingContext, + ) { + self.0.put_setting_status(source, field_status_by_nesting_context); + } +} + +pub struct GitHubMdBasicFormatter(pub GitHubMdFormatter); +impl Formatter for GitHubMdBasicFormatter { + fn begin_document(&mut self) { + write!(self.0.w, "## List of the embedder settings\n\n").unwrap(); + self.0.begin_document(); + } + + fn end_document(&mut self) { + self.0.end_document(); + } + + fn begin_header(&mut self) { + write!( + self.0.w, + r#" + + +Setting +Description +Type +Default Value +Regenerate on Change + "# + ) + .unwrap() + } + + fn put_source_header(&mut self, _source: EmbedderSource) {} + + fn end_header(&mut self) { + self.0.end_header(); + } + + fn begin_setting( + &mut self, + setting: MetaEmbeddingSetting, + description: &'static str, + kind: &'static str, + reindex_outcome: ReindexOutcome, + default_value: &'static str, + ) { + self.0.begin_setting(setting, description, kind, reindex_outcome, default_value); + } + + fn end_setting(&mut self, setting: MetaEmbeddingSetting) { + self.0.end_setting(setting); + } + + fn put_setting_status( + &mut self, + _source: EmbedderSource, + _field_status_by_nesting_context: FieldStatusByNestingContext, + ) { + } +} + +pub enum FieldStatusByNestingContext { + Invariant(FieldStatus), + Variant(VariantFieldStatusByNestingContext), +} + +pub struct VariantFieldStatusByNestingContext { + not_nested: FieldStatus, + search: FieldStatus, + index: FieldStatus, +} + +fn format_settings(mut fmt: impl Formatter) { + #![allow(unused_labels)] // the labels are used as documentation + fmt.begin_document(); + fmt.begin_header(); + for source in enum_iterator::all::() { + fmt.put_source_header(source); + } + fmt.end_header(); + 'setting: for setting in enum_iterator::all::() { + let description = setting.description(); + let kind = setting.kind(); + let reindex_outcome = setting.reindex_outcome(); + let default_value = setting.default_value(); + fmt.begin_setting(setting, description, kind, reindex_outcome, default_value); + + 'source: for source in enum_iterator::all::() { + if setting == MetaEmbeddingSetting::Source { + break 'source; + } + let mut field_status = VariantFieldStatusByNestingContext { + not_nested: FieldStatus::Disallowed, + search: FieldStatus::Disallowed, + index: FieldStatus::Disallowed, + }; + 'nesting: for nesting_context in enum_iterator::all::() { + let status = EmbeddingSettings::field_status(source, setting, nesting_context); + + match nesting_context { + NestingContext::NotNested => { + field_status.not_nested = status; + } + NestingContext::Search => { + field_status.search = status; + } + NestingContext::Indexing => { + field_status.index = status; + } + } + } + let field_status_by_nesting_context = if field_status.index == field_status.search + && field_status.search == field_status.not_nested + { + FieldStatusByNestingContext::Invariant(field_status.not_nested) + } else { + FieldStatusByNestingContext::Variant(field_status) + }; + fmt.put_setting_status(source, field_status_by_nesting_context); + } + fmt.end_setting(setting); + } + fmt.end_document(); +} + +fn main() { + let mut std_out = std::io::stdout().lock(); + + write!( + &mut std_out, + "The tables below have been generated by calling `cargo run --bin embedder_settings`\n\n" + ) + .unwrap(); + + let formatter = GitHubMdFormatter::new(&mut std_out); + let formatter = GitHubMdBasicFormatter(formatter); + format_settings(formatter); + + write!(&mut std_out, "\n\n").unwrap(); + + let formatter = GitHubMdFormatter::new(&mut std_out); + let formatter = GitHubMdAvailabilityFormatter(formatter); + format_settings(formatter); +}