String operations with aho-corasick
This commit is contained in:
parent
c1e634b473
commit
1cafca9048
10
Cargo.lock
generated
10
Cargo.lock
generated
@ -29,6 +29,15 @@ dependencies = [
|
|||||||
"zerocopy",
|
"zerocopy",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "1.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "allocator-api2"
|
name = "allocator-api2"
|
||||||
version = "0.2.16"
|
version = "0.2.16"
|
||||||
@ -398,6 +407,7 @@ dependencies = [
|
|||||||
name = "musichoard"
|
name = "musichoard"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
"crossterm",
|
"crossterm",
|
||||||
"mockall",
|
"mockall",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
@ -6,7 +6,9 @@ edition = "2021"
|
|||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
aho-corasick = { version = "1.1.2", optional = true }
|
||||||
crossterm = { version = "0.27.0", optional = true}
|
crossterm = { version = "0.27.0", optional = true}
|
||||||
|
once_cell = { version = "1.19.0", optional = true}
|
||||||
openssh = { version = "0.10.3", features = ["native-mux"], default-features = false, optional = true}
|
openssh = { version = "0.10.3", features = ["native-mux"], default-features = false, optional = true}
|
||||||
ratatui = { version = "0.26.0", optional = true}
|
ratatui = { version = "0.26.0", optional = true}
|
||||||
serde = { version = "1.0.196", features = ["derive"], optional = true }
|
serde = { version = "1.0.196", features = ["derive"], optional = true }
|
||||||
@ -27,7 +29,7 @@ bin = ["structopt"]
|
|||||||
database-json = ["serde", "serde_json"]
|
database-json = ["serde", "serde_json"]
|
||||||
library-beets = []
|
library-beets = []
|
||||||
ssh-library = ["openssh", "tokio"]
|
ssh-library = ["openssh", "tokio"]
|
||||||
tui = ["crossterm", "ratatui"]
|
tui = ["aho-corasick", "crossterm", "once_cell", "ratatui"]
|
||||||
|
|
||||||
[[bin]]
|
[[bin]]
|
||||||
name = "musichoard"
|
name = "musichoard"
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
|
use aho_corasick::AhoCorasick;
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
use musichoard::collection::artist::Artist;
|
use musichoard::collection::artist::Artist;
|
||||||
|
|
||||||
use crate::tui::{
|
use crate::tui::{
|
||||||
@ -9,6 +12,16 @@ use crate::tui::{
|
|||||||
lib::IMusicHoard,
|
lib::IMusicHoard,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Unlikely that this covers all possible strings, but it should at least cover strings
|
||||||
|
// relevant for music (at least in English). The list of characters handled is based on
|
||||||
|
// https://wiki.musicbrainz.org/User:Yurim/Punctuation_and_Special_Characters.
|
||||||
|
//
|
||||||
|
// U+2010 hyphen, U+2012 figure dash, U+2013 en dash, U+2014 em dash, U+2015 horizontal bar, U+2018,
|
||||||
|
// U+2019, U+201C, U+201D, U+2026, U+2212 minus sign
|
||||||
|
/// Non-ASCII punctuation to search for, listed in the same order as the substitutes in `REPLACE`.
static PATTERNS: [&str; 11] = ["‐", "‒", "–", "—", "―", "‘", "’", "“", "”", "…", "−"];
|
||||||
|
/// ASCII substitutes; entry `i` is the replacement for matches of `PATTERNS[i]`.
static REPLACE: [&str; 11] = ["-", "-", "-", "-", "-", "'", "'", "\"", "\"", "...", "-"];
|
||||||
|
// Shared matcher over `PATTERNS`, wrapped in `Lazy` so it is constructed on first use rather than
// at every call site. The `unwrap` panics if the automaton cannot be built from `PATTERNS`.
static AC: Lazy<AhoCorasick> = Lazy::new(|| AhoCorasick::new(&PATTERNS).unwrap());
|
||||||
|
|
||||||
pub struct AppSearch {
|
pub struct AppSearch {
|
||||||
string: String,
|
string: String,
|
||||||
orig: ListSelection,
|
orig: ListSelection,
|
||||||
@ -160,12 +173,9 @@ impl<MH: IMusicHoard> IAppInteractSearchPrivate for AppMachine<MH, AppSearch> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn is_char_sensitive(artist_name: &str) -> bool {
|
fn is_char_sensitive(artist_name: &str) -> bool {
|
||||||
let special_chars: &[char] = &['‐', '‒', '–', '—', '―', '−', '‘', '’', '“', '”', '…'];
|
AC.find(artist_name).is_some()
|
||||||
artist_name.chars().any(|ch| special_chars.contains(&ch))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: use aho_corasick for normalization - AhoCorasick does not implement PartialEq. It
|
|
||||||
// makes more sense to be placed in app.rs as it would make ArtistSelection non-trivial.
|
|
||||||
fn normalize_search(search: &str, lowercase: bool, asciify: bool) -> String {
|
fn normalize_search(search: &str, lowercase: bool, asciify: bool) -> String {
|
||||||
let normalized = if lowercase {
|
let normalized = if lowercase {
|
||||||
search.to_lowercase()
|
search.to_lowercase()
|
||||||
@ -173,17 +183,8 @@ impl<MH: IMusicHoard> IAppInteractSearchPrivate for AppMachine<MH, AppSearch> {
|
|||||||
search.to_owned()
|
search.to_owned()
|
||||||
};
|
};
|
||||||
|
|
||||||
// Unlikely that this covers all possible strings, but it should at least cover strings
|
|
||||||
// relevant for music (at least in English). The list of characters handled is based on
|
|
||||||
// https://wiki.musicbrainz.org/User:Yurim/Punctuation_and_Special_Characters.
|
|
||||||
if asciify {
|
if asciify {
|
||||||
normalized
|
AC.replace_all(&normalized, &REPLACE)
|
||||||
// U+2010 hyphen, U+2012 figure dash, U+2013 en dash, U+2014 em dash,
|
|
||||||
// U+2015 horizontal bar, U+2212 minus sign
|
|
||||||
.replace(['‐', '‒', '–', '—', '―', '−'], "-")
|
|
||||||
.replace(['‘', '’'], "'") // U+2018, U+2019
|
|
||||||
.replace(['“', '”'], "\"") // U+201C, U+201D
|
|
||||||
.replace('…', "...") // U+2026
|
|
||||||
} else {
|
} else {
|
||||||
normalized
|
normalized
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user