Merge pull request #816 from MarinPostma/fix-index-length

Fix bug where long documents were not indexed completely
This commit is contained in:
Clément Renault
2020-06-30 19:19:07 +02:00
committed by GitHub
4 changed files with 92 additions and 3 deletions

View File

@@ -124,7 +124,7 @@ fn index_token<A>(
) -> bool ) -> bool
where A: AsRef<[u8]>, where A: AsRef<[u8]>,
{ {
if token.word_index >= word_limit { if token.index >= word_limit {
return false; return false;
} }
@@ -277,4 +277,36 @@ mod tests {
.get(&"🇯🇵".to_owned().into_bytes()) .get(&"🇯🇵".to_owned().into_bytes())
.is_some()); .is_some());
} }
#[test]
// Regression test built from the sample text of issue #807: index one long
// text and check that the word "buffering", which first occurs late in that
// text, is present in the built index.
fn very_long_text() {
    // NOTE: the literal below contains raw line breaks; continuation lines must
    // stay at column 0 so that the indexed text is not altered.
    let text = " The locations block is the most powerful, and potentially most involved, section of the .platform.app.yaml file. It allows you to control how the application container responds to incoming requests at a very fine-grained level. Common patterns also vary between language containers due to the way PHP-FPM handles incoming requests.\nEach entry of the locations block is an absolute URI path (with leading /) and its value includes the configuration directives for how the web server should handle matching requests. That is, if your domain is example.com then '/' means &ldquo;requests for example.com/&rdquo;, while '/admin' means &ldquo;requests for example.com/admin&rdquo;. If multiple blocks could match an incoming request then the most-specific will apply.\nweb:locations:&#39;/&#39;:# Rules for all requests that don&#39;t otherwise match....&#39;/sites/default/files&#39;:# Rules for any requests that begin with /sites/default/files....The simplest possible locations configuration is one that simply passes all requests on to your application unconditionally:\nweb:locations:&#39;/&#39;:passthru:trueThat is, all requests to /* should be forwarded to the process started by web.commands.start above. Note that for PHP containers the passthru key must specify what PHP file the request should be forwarded to, and must also specify a docroot under which the file lives. For example:\nweb:locations:&#39;/&#39;:root:&#39;web&#39;passthru:&#39;/app.php&#39;This block will serve requests to / from the web directory in the application, and if a file doesn&rsquo;t exist on disk then the request will be forwarded to the /app.php script.\nA full list of the possible subkeys for locations is below.\n root: The folder from which to serve static assets for this location relative to the application root. The application root is the directory in which the .platform.app.yaml file is located. Typical values for this property include public or web. 
Setting it to '' is not recommended, and its behavior may vary depending on the type of application. Absolute paths are not supported.\n passthru: Whether to forward disallowed and missing resources from this location to the application and can be true, false or an absolute URI path (with leading /). The default value is false. For non-PHP applications it will generally be just true or false. In a PHP application this will typically be the front controller such as /index.php or /app.php. This entry works similar to mod_rewrite under Apache. Note: If the value of passthru does not begin with the same value as the location key it is under, the passthru may evaluate to another entry. That may be useful when you want different cache settings for different paths, for instance, but want missing files in all of them to map back to the same front controller. See the example block below.\n index: The files to consider when serving a request for a directory: an array of file names or null. (typically ['index.html']). Note that in order for this to work, access to the static files named must be allowed by the allow or rules keys for this location.\n expires: How long to allow static assets from this location to be cached (this enables the Cache-Control and Expires headers) and can be a time or -1 for no caching (default). Times can be suffixed with &ldquo;ms&rdquo; (milliseconds), &ldquo;s&rdquo; (seconds), &ldquo;m&rdquo; (minutes), &ldquo;h&rdquo; (hours), &ldquo;d&rdquo; (days), &ldquo;w&rdquo; (weeks), &ldquo;M&rdquo; (months, 30d) or &ldquo;y&rdquo; (years, 365d).\n scripts: Whether to allow loading scripts in that location (true or false). This directive is only meaningful on PHP.\n allow: Whether to allow serving files which don&rsquo;t match a rule (true or false, default: true).\n headers: Any additional headers to apply to static assets. This section is a mapping of header names to header values. 
Responses from the application aren&rsquo;t affected, to avoid overlap with the application&rsquo;s own ability to include custom headers in the response.\n rules: Specific overrides for a specific location. The key is a PCRE (regular expression) that is matched against the full request path.\n request_buffering: Most application servers do not support chunked requests (e.g. fpm, uwsgi), so Platform.sh enables request_buffering by default to handle them. That default configuration would look like this if it was present in .platform.app.yaml:\nweb:locations:&#39;/&#39;:passthru:truerequest_buffering:enabled:truemax_request_size:250mIf the application server can already efficiently handle chunked requests, the request_buffering subkey can be modified to disable it entirely (enabled: false). Additionally, applications that frequently deal with uploads greater than 250MB in size can update the max_request_size key to the application&rsquo;s needs. Note that modifications to request_buffering will need to be specified at each location where it is desired.\n ";

    // Index the whole text as attribute position 0 of document 0.
    let mut raw_indexer = RawIndexer::new(fst::Set::default());
    raw_indexer.index_text(DocumentId(0), IndexedPos(0), text);
    let built = raw_indexer.build();

    // A word taken from the last part of the text must have been indexed.
    let late_word = "buffering".to_owned().into_bytes();
    assert!(built.words_doc_indexes.get(&late_word).is_some());
}
#[test]
// Index a text made of 1000 occurrences of "less" followed by a single
// "more", then check that "less" is in the built index while "more" is not.
// NOTE(review): presumably 1000 is the indexer's word limit, as the test name
// suggests -- confirm against the indexer's constant.
fn words_over_index_1000_not_indexed() {
    // Tokens 0..=999 are "less"; "more" is the token that follows them.
    let mut text = "less ".repeat(1000);
    text.push_str("more");

    let mut raw_indexer = RawIndexer::new(fst::Set::default());
    raw_indexer.index_text(DocumentId(0), IndexedPos(0), &text);
    let built = raw_indexer.build();

    let within_limit = "less".to_owned().into_bytes();
    let past_limit = "more".to_owned().into_bytes();
    assert!(built.words_doc_indexes.get(&within_limit).is_some());
    assert!(built.words_doc_indexes.get(&past_limit).is_none());
}
} }

View File

@@ -175,3 +175,23 @@ async fn check_add_documents_with_nested_sequence() {
assert_eq!(status_code, 200); assert_eq!(status_code, 200);
assert_eq!(response["hits"], body); assert_eq!(response["hits"], body);
} }
#[actix_rt::test]
// Regression test using the sample document from #807: add a document whose
// `text` field is very long, then search for "request_buffering", a term that
// first occurs late in that field, and expect at least one hit.
async fn add_document_with_long_field() {
    let mut server = common::Server::with_uid("test");
    server.create_index(json!({ "uid": "test" })).await;

    // NOTE: the "text" literal contains raw line breaks; continuation lines
    // must stay at column 0 so that the payload is not altered.
    let documents = json!([{
        "documentId":"de1c2adbb897effdfe0deae32a01035e46f932ce",
        "rank":1,
        "relurl":"/configuration/app/web.html#locations",
        "section":"Web",
        "site":"docs",
        "text":" The locations block is the most powerful, and potentially most involved, section of the .platform.app.yaml file. It allows you to control how the application container responds to incoming requests at a very fine-grained level. Common patterns also vary between language containers due to the way PHP-FPM handles incoming requests.\nEach entry of the locations block is an absolute URI path (with leading /) and its value includes the configuration directives for how the web server should handle matching requests. That is, if your domain is example.com then '/' means &ldquo;requests for example.com/&rdquo;, while '/admin' means &ldquo;requests for example.com/admin&rdquo;. If multiple blocks could match an incoming request then the most-specific will apply.\nweb:locations:&#39;/&#39;:# Rules for all requests that don&#39;t otherwise match....&#39;/sites/default/files&#39;:# Rules for any requests that begin with /sites/default/files....The simplest possible locations configuration is one that simply passes all requests on to your application unconditionally:\nweb:locations:&#39;/&#39;:passthru:trueThat is, all requests to /* should be forwarded to the process started by web.commands.start above. Note that for PHP containers the passthru key must specify what PHP file the request should be forwarded to, and must also specify a docroot under which the file lives. For example:\nweb:locations:&#39;/&#39;:root:&#39;web&#39;passthru:&#39;/app.php&#39;This block will serve requests to / from the web directory in the application, and if a file doesn&rsquo;t exist on disk then the request will be forwarded to the /app.php script.\nA full list of the possible subkeys for locations is below.\n root: The folder from which to serve static assets for this location relative to the application root. The application root is the directory in which the .platform.app.yaml file is located. Typical values for this property include public or web. 
Setting it to '' is not recommended, and its behavior may vary depending on the type of application. Absolute paths are not supported.\n passthru: Whether to forward disallowed and missing resources from this location to the application and can be true, false or an absolute URI path (with leading /). The default value is false. For non-PHP applications it will generally be just true or false. In a PHP application this will typically be the front controller such as /index.php or /app.php. This entry works similar to mod_rewrite under Apache. Note: If the value of passthru does not begin with the same value as the location key it is under, the passthru may evaluate to another entry. That may be useful when you want different cache settings for different paths, for instance, but want missing files in all of them to map back to the same front controller. See the example block below.\n index: The files to consider when serving a request for a directory: an array of file names or null. (typically ['index.html']). Note that in order for this to work, access to the static files named must be allowed by the allow or rules keys for this location.\n expires: How long to allow static assets from this location to be cached (this enables the Cache-Control and Expires headers) and can be a time or -1 for no caching (default). Times can be suffixed with &ldquo;ms&rdquo; (milliseconds), &ldquo;s&rdquo; (seconds), &ldquo;m&rdquo; (minutes), &ldquo;h&rdquo; (hours), &ldquo;d&rdquo; (days), &ldquo;w&rdquo; (weeks), &ldquo;M&rdquo; (months, 30d) or &ldquo;y&rdquo; (years, 365d).\n scripts: Whether to allow loading scripts in that location (true or false). This directive is only meaningful on PHP.\n allow: Whether to allow serving files which don&rsquo;t match a rule (true or false, default: true).\n headers: Any additional headers to apply to static assets. This section is a mapping of header names to header values. 
Responses from the application aren&rsquo;t affected, to avoid overlap with the application&rsquo;s own ability to include custom headers in the response.\n rules: Specific overrides for a specific location. The key is a PCRE (regular expression) that is matched against the full request path.\n request_buffering: Most application servers do not support chunked requests (e.g. fpm, uwsgi), so Platform.sh enables request_buffering by default to handle them. That default configuration would look like this if it was present in .platform.app.yaml:\nweb:locations:&#39;/&#39;:passthru:truerequest_buffering:enabled:truemax_request_size:250mIf the application server can already efficiently handle chunked requests, the request_buffering subkey can be modified to disable it entirely (enabled: false). Additionally, applications that frequently deal with uploads greater than 250MB in size can update the max_request_size key to the application&rsquo;s needs. Note that modifications to request_buffering will need to be specified at each location where it is desired.\n ",
        "title":"Locations",
        "url":"/configuration/app/web.html#locations"
    }]);
    server.add_or_replace_multiple_documents(documents).await;

    // The status code of the search response is not checked by this test.
    let (search_result, _status_code) =
        server.search_post(json!({ "q": "request_buffering" })).await;
    let hits = search_result["hits"].as_array().unwrap();
    assert!(!hits.is_empty());
}

View File

@@ -196,11 +196,11 @@ async fn delete_index_and_recreate_it() {
let mut server = common::Server::with_uid("movies"); let mut server = common::Server::with_uid("movies");
// 0 - delete unexisting index is error // 0 - delete unexisting index is error
let (response, status_code) = server.delete_request("/indexes/test").await; let (response, status_code) = server.delete_request("/indexes/test").await;
assert_eq!(status_code, 404); assert_eq!(status_code, 404);
assert_eq!(&response["errorCode"], "index_not_found"); assert_eq!(&response["errorCode"], "index_not_found");
// 1 - Create a new index // 1 - Create a new index
let body = json!({ let body = json!({

View File

@@ -101,11 +101,14 @@ pub fn split_query_string(query: &str) -> impl Iterator<Item = &str> {
#[derive(Debug, Copy, Clone, PartialEq, Eq)] #[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct Token<'a> { pub struct Token<'a> {
pub word: &'a str, pub word: &'a str,
/// index of the token in the token sequence
pub index: usize,
pub word_index: usize, pub word_index: usize,
pub char_index: usize, pub char_index: usize,
} }
pub struct Tokenizer<'a> { pub struct Tokenizer<'a> {
count: usize,
inner: &'a str, inner: &'a str,
word_index: usize, word_index: usize,
char_index: usize, char_index: usize,
@@ -121,6 +124,7 @@ impl<'a> Tokenizer<'a> {
.fold((0, 0), chars_count_index); .fold((0, 0), chars_count_index);
Tokenizer { Tokenizer {
count: 0,
inner: &string[index..], inner: &string[index..],
word_index: 0, word_index: 0,
char_index: count, char_index: count,
@@ -150,6 +154,7 @@ impl<'a> Iterator for Tokenizer<'a> {
let token = Token { let token = Token {
word: string, word: string,
index: self.count,
word_index: self.word_index, word_index: self.word_index,
char_index: self.char_index, char_index: self.char_index,
}; };
@@ -158,6 +163,7 @@ impl<'a> Iterator for Tokenizer<'a> {
self.word_index += 1; self.word_index += 1;
} }
self.count += 1;
self.char_index += count; self.char_index += count;
self.inner = &self.inner[index..]; self.inner = &self.inner[index..];
@@ -175,6 +181,7 @@ where
{ {
inner: I, inner: I,
current: Option<Peekable<Tokenizer<'a>>>, current: Option<Peekable<Tokenizer<'a>>>,
count: usize,
word_offset: usize, word_offset: usize,
char_offset: usize, char_offset: usize,
} }
@@ -188,6 +195,7 @@ where
SeqTokenizer { SeqTokenizer {
inner: iter, inner: iter,
current, current,
count: 0,
word_offset: 0, word_offset: 0,
char_offset: 0, char_offset: 0,
} }
@@ -209,6 +217,7 @@ where
// to the token before returning it // to the token before returning it
let token = Token { let token = Token {
word: token.word, word: token.word,
index: self.count,
word_index: token.word_index + self.word_offset, word_index: token.word_index + self.word_offset,
char_index: token.char_index + self.char_offset, char_index: token.char_index + self.char_offset,
}; };
@@ -249,6 +258,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "salut", word: "salut",
index: 0,
word_index: 0, word_index: 0,
char_index: 0 char_index: 0
}) })
@@ -261,6 +271,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "yo", word: "yo",
index: 0,
word_index: 0, word_index: 0,
char_index: 0 char_index: 0
}) })
@@ -276,6 +287,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "yo", word: "yo",
index: 0,
word_index: 0, word_index: 0,
char_index: 4 char_index: 4
}) })
@@ -284,6 +296,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "lolo", word: "lolo",
index: 1,
word_index: 1, word_index: 1,
char_index: 7 char_index: 7
}) })
@@ -292,6 +305,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "aïe", word: "aïe",
index: 2,
word_index: 9, word_index: 9,
char_index: 13 char_index: 13
}) })
@@ -300,6 +314,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "ouch", word: "ouch",
index: 3,
word_index: 17, word_index: 17,
char_index: 18 char_index: 18
}) })
@@ -312,6 +327,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "yo", word: "yo",
index: 0,
word_index: 0, word_index: 0,
char_index: 0 char_index: 0
}) })
@@ -320,6 +336,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "lolo", word: "lolo",
index: 1,
word_index: 8, word_index: 8,
char_index: 5 char_index: 5
}) })
@@ -328,6 +345,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "wtf", word: "wtf",
index: 2,
word_index: 16, word_index: 16,
char_index: 12 char_index: 12
}) })
@@ -336,6 +354,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "lol", word: "lol",
index: 3,
word_index: 17, word_index: 17,
char_index: 18 char_index: 18
}) })
@@ -344,6 +363,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "aïe", word: "aïe",
index: 4,
word_index: 25, word_index: 25,
char_index: 24 char_index: 24
}) })
@@ -359,6 +379,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "yo", word: "yo",
index: 0,
word_index: 0, word_index: 0,
char_index: 4 char_index: 4
}) })
@@ -367,6 +388,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "😂", word: "😂",
index: 1,
word_index: 1, word_index: 1,
char_index: 7 char_index: 7
}) })
@@ -375,6 +397,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "aïe", word: "aïe",
index: 2,
word_index: 9, word_index: 9,
char_index: 10 char_index: 10
}) })
@@ -387,6 +410,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "yo", word: "yo",
index: 0,
word_index: 0, word_index: 0,
char_index: 0 char_index: 0
}) })
@@ -395,6 +419,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "lolo", word: "lolo",
index: 1,
word_index: 8, word_index: 8,
char_index: 5 char_index: 5
}) })
@@ -403,6 +428,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "😱", word: "😱",
index: 2,
word_index: 16, word_index: 16,
char_index: 12 char_index: 12
}) })
@@ -411,6 +437,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "lol", word: "lol",
index: 3,
word_index: 17, word_index: 17,
char_index: 16 char_index: 16
}) })
@@ -419,6 +446,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "😣", word: "😣",
index: 4,
word_index: 25, word_index: 25,
char_index: 22 char_index: 22
}) })
@@ -434,6 +462,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "\u{2ec4}", word: "\u{2ec4}",
index: 0,
word_index: 0, word_index: 0,
char_index: 0 char_index: 0
}) })
@@ -442,6 +471,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "lolilol", word: "lolilol",
index: 1,
word_index: 1, word_index: 1,
char_index: 1 char_index: 1
}) })
@@ -450,6 +480,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "\u{2ec7}", word: "\u{2ec7}",
index: 2,
word_index: 2, word_index: 2,
char_index: 8 char_index: 8
}) })
@@ -462,6 +493,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "\u{2ec4}", word: "\u{2ec4}",
index: 0,
word_index: 0, word_index: 0,
char_index: 0 char_index: 0
}) })
@@ -470,6 +502,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "\u{2ed3}", word: "\u{2ed3}",
index: 1,
word_index: 1, word_index: 1,
char_index: 1 char_index: 1
}) })
@@ -478,6 +511,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "\u{2ef2}", word: "\u{2ef2}",
index: 2,
word_index: 2, word_index: 2,
char_index: 2 char_index: 2
}) })
@@ -486,6 +520,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "lolilol", word: "lolilol",
index: 3,
word_index: 3, word_index: 3,
char_index: 4 char_index: 4
}) })
@@ -494,6 +529,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "hello", word: "hello",
index: 4,
word_index: 4, word_index: 4,
char_index: 14 char_index: 14
}) })
@@ -502,6 +538,7 @@ mod tests {
tokenizer.next(), tokenizer.next(),
Some(Token { Some(Token {
word: "\u{2ec7}", word: "\u{2ec7}",
index: 5,
word_index: 5, word_index: 5,
char_index: 23 char_index: 23
}) })