mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-11-04 09:56:28 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			79 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Rust
		
	
	
	
	
	
			
		
		
	
	
			79 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Rust
		
	
	
	
	
	
use std::fs::File;
 | 
						|
use std::io::{Cursor, Read, Seek, Write};
 | 
						|
use std::path::{Path, PathBuf};
 | 
						|
use std::{env, fs};
 | 
						|
 | 
						|
use bytes::Bytes;
 | 
						|
use convert_case::{Case, Casing};
 | 
						|
use flate2::read::GzDecoder;
 | 
						|
use reqwest::IntoUrl;
 | 
						|
 | 
						|
/// Base URL under which the gzip-compressed datasets are hosted; a dataset is fetched
/// from `{BASE_URL}/{name}.{extension}.gz`.
const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets";
 | 
						|
 | 
						|
/// Songs dataset, described as a `(file stem, file extension)` pair.
const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv");
 | 
						|
/// Wiki articles dataset, described as a `(file stem, file extension)` pair.
const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv");
 | 
						|
/// Movies dataset, described as a `(file stem, file extension)` pair.
const DATASET_MOVIES: (&str, &str) = ("movies", "json");
 | 
						|
 | 
						|
/// The name of the environment variable used to select the path
 | 
						|
/// of the directory containing the datasets. When it is unset, `OUT_DIR` is used instead.
 | 
						|
const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
 | 
						|
 | 
						|
fn main() -> anyhow::Result<()> {
 | 
						|
    let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?));
 | 
						|
 | 
						|
    let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
 | 
						|
    let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
 | 
						|
    write!(
 | 
						|
        manifest_paths_file,
 | 
						|
        r#"//! This file is generated by the build script.
 | 
						|
//! Do not modify by hand, use the build.rs file.
 | 
						|
#![allow(dead_code)]
 | 
						|
"#
 | 
						|
    )?;
 | 
						|
    writeln!(manifest_paths_file)?;
 | 
						|
 | 
						|
    for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES] {
 | 
						|
        let out_path = out_dir.join(dataset);
 | 
						|
        let out_file = out_path.with_extension(extension);
 | 
						|
 | 
						|
        writeln!(
 | 
						|
            &mut manifest_paths_file,
 | 
						|
            r#"pub const {}: &str = {:?};"#,
 | 
						|
            dataset.to_case(Case::ScreamingSnake),
 | 
						|
            out_file.display(),
 | 
						|
        )?;
 | 
						|
 | 
						|
        if out_file.exists() {
 | 
						|
            eprintln!(
 | 
						|
                "The dataset {} already exists on the file system and will not be downloaded again",
 | 
						|
                out_path.display(),
 | 
						|
            );
 | 
						|
            continue;
 | 
						|
        }
 | 
						|
        let url = format!("{}/{}.{}.gz", BASE_URL, dataset, extension);
 | 
						|
        eprintln!("downloading: {}", url);
 | 
						|
        let bytes = download_dataset(url.clone())?;
 | 
						|
        eprintln!("{} downloaded successfully", url);
 | 
						|
        eprintln!("uncompressing in {}", out_file.display());
 | 
						|
        uncompress_in_file(bytes, &out_file)?;
 | 
						|
    }
 | 
						|
 | 
						|
    Ok(())
 | 
						|
}
 | 
						|
 | 
						|
fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
 | 
						|
    let bytes =
 | 
						|
        reqwest::blocking::Client::builder().timeout(None).build()?.get(url).send()?.bytes()?;
 | 
						|
    Ok(Cursor::new(bytes))
 | 
						|
}
 | 
						|
 | 
						|
fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
 | 
						|
    let path = path.as_ref();
 | 
						|
    let mut gz = GzDecoder::new(bytes);
 | 
						|
    let mut dataset = Vec::new();
 | 
						|
    gz.read_to_end(&mut dataset)?;
 | 
						|
 | 
						|
    fs::write(path, dataset)?;
 | 
						|
    Ok(())
 | 
						|
}
 |