Revert "Stream documents"

This commit is contained in:
Tamo
2024-05-20 15:09:45 +02:00
committed by GitHub
parent 59ecf1cea7
commit 7e251b43d4
16 changed files with 91 additions and 151 deletions

View File

@@ -30,7 +30,7 @@ grenad = { version = "0.4.6", default-features = false, features = [
"rayon",
"tempfile",
] }
heed = { version = "0.20.1", default-features = false, features = [
heed = { version = "0.20.0-alpha.9", default-features = false, features = [
"serde-json",
"serde-bincode",
"read-txn-no-tls",
@@ -82,7 +82,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls",
] }
tiktoken-rs = "0.5.8"
liquid = "0.26.4"
arroy = "0.3.1"
arroy = "0.2.0"
rand = "0.8.5"
tracing = "0.1.40"
ureq = { version = "2.9.7", features = ["json"] }

View File

@@ -1,3 +0,0 @@
target
corpus
artifacts

View File

@@ -48,6 +48,8 @@ pub enum InternalError {
GrenadInvalidFormatVersion,
#[error("Invalid merge while processing {process}")]
IndexingMergingKeys { process: &'static str },
#[error("{}", HeedError::InvalidDatabaseTyping)]
InvalidDatabaseTyping,
#[error(transparent)]
RayonThreadPool(#[from] ThreadPoolBuildError),
#[error(transparent)]
@@ -427,6 +429,7 @@ impl From<HeedError> for Error {
// TODO use the encoding
HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })),
HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })),
HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping),
HeedError::DatabaseClosing => InternalError(DatabaseClosing),
HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions),
}

View File

@@ -184,7 +184,7 @@ impl Index {
options.max_dbs(25);
let env = unsafe { options.open(path) }?;
let env = options.open(path)?;
let mut wtxn = env.write_txn()?;
let main = env.database_options().name(MAIN).create(&mut wtxn)?;
let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?;
@@ -294,11 +294,6 @@ impl Index {
self.env.read_txn()
}
/// Create a static read transaction to be able to read the index without keeping a reference to it.
pub fn static_read_txn(&self) -> heed::Result<RoTxn<'static>> {
self.env.clone().static_read_txn()
}
/// Returns the canonicalized path where the heed `Env` of this `Index` lives.
pub fn path(&self) -> &Path {
self.env.path()

View File

@@ -379,7 +379,7 @@ pub(crate) mod test_helpers {
let mut options = heed::EnvOpenOptions::new();
let options = options.map_size(4096 * 4 * 1000 * 100);
let tempdir = tempfile::TempDir::new().unwrap();
let env = unsafe { options.open(tempdir.path()) }.unwrap();
let env = options.open(tempdir.path()).unwrap();
let mut wtxn = env.write_txn().unwrap();
let content = env.create_database(&mut wtxn, None).unwrap();
wtxn.commit().unwrap();

View File

@@ -556,7 +556,7 @@ where
let writer_index = (embedder_index as u16) << 8;
for k in 0..=u8::MAX {
let writer =
arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension);
arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension)?;
if writer.is_empty(wtxn)? {
break;
}

View File

@@ -661,7 +661,7 @@ pub(crate) fn write_typed_chunk_into_index(
)?;
let writer_index = (embedder_index as u16) << 8;
// FIXME: allow customizing distance
let writers: Vec<_> = (0..=u8::MAX)
let writers: std::result::Result<Vec<_>, _> = (0..=u8::MAX)
.map(|k| {
arroy::Writer::new(
index.vector_arroy,
@@ -670,6 +670,7 @@ pub(crate) fn write_typed_chunk_into_index(
)
})
.collect();
let writers = writers?;
// remove vectors for docids we want them removed
let merger = remove_vectors_builder.build();