String operations with aho-corasick

This commit is contained in:
Wojciech Kozlowski 2024-02-18 16:49:58 +01:00
parent c1e634b473
commit 1cafca9048
3 changed files with 28 additions and 15 deletions

10
Cargo.lock generated
View File

@ -29,6 +29,15 @@ dependencies = [
"zerocopy",
]
[[package]]
name = "aho-corasick"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
[[package]]
name = "allocator-api2"
version = "0.2.16"
@ -398,6 +407,7 @@ dependencies = [
name = "musichoard"
version = "0.1.0"
dependencies = [
"aho-corasick",
"crossterm",
"mockall",
"once_cell",

View File

@ -6,7 +6,9 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
aho-corasick = { version = "1.1.2", optional = true }
crossterm = { version = "0.27.0", optional = true}
once_cell = { version = "1.19.0", optional = true}
openssh = { version = "0.10.3", features = ["native-mux"], default-features = false, optional = true}
ratatui = { version = "0.26.0", optional = true}
serde = { version = "1.0.196", features = ["derive"], optional = true }
@ -27,7 +29,7 @@ bin = ["structopt"]
database-json = ["serde", "serde_json"]
library-beets = []
ssh-library = ["openssh", "tokio"]
tui = ["crossterm", "ratatui"]
tui = ["aho-corasick", "crossterm", "once_cell", "ratatui"]
[[bin]]
name = "musichoard"

View File

@ -1,3 +1,6 @@
use aho_corasick::AhoCorasick;
use once_cell::sync::Lazy;
use musichoard::collection::artist::Artist;
use crate::tui::{
@ -9,6 +12,16 @@ use crate::tui::{
lib::IMusicHoard,
};
// Unlikely that this covers all possible strings, but it should at least cover strings
// relevant for music (at least in English). The list of characters handled is based on
// https://wiki.musicbrainz.org/User:Yurim/Punctuation_and_Special_Characters.
//
// U+2010 hyphen, U+2012 figure dash, U+2013 en dash, U+2014 em dash, U+2015 horizontal bar, U+2018,
// U+2019, U+201C, U+201D, U+2026, U+2212 minus sign
// Special characters the search matcher looks for, written as escapes so the code points stay
// visible and survive tooling that mangles non-ASCII text. Entry `i` of `PATTERNS` is replaced by
// entry `i` of `REPLACE`, so the two arrays must stay index-aligned.
static PATTERNS: [&str; 11] = [
    "\u{2010}", // hyphen
    "\u{2012}", // figure dash
    "\u{2013}", // en dash
    "\u{2014}", // em dash
    "\u{2015}", // horizontal bar
    "\u{2018}", // left single quotation mark
    "\u{2019}", // right single quotation mark
    "\u{201C}", // left double quotation mark
    "\u{201D}", // right double quotation mark
    "\u{2026}", // horizontal ellipsis
    "\u{2212}", // minus sign
];
static REPLACE: [&str; 11] = ["-", "-", "-", "-", "-", "'", "'", "\"", "\"", "...", "-"];
// Shared matcher over `PATTERNS`, wrapped in `Lazy` so it is constructed on first access rather
// than at every call site. The `unwrap` means a construction failure panics at that point.
static AC: Lazy<AhoCorasick> = Lazy::new(|| AhoCorasick::new(&PATTERNS).unwrap());
pub struct AppSearch {
string: String,
orig: ListSelection,
@ -160,12 +173,9 @@ impl<MH: IMusicHoard> IAppInteractSearchPrivate for AppMachine<MH, AppSearch> {
}
fn is_char_sensitive(artist_name: &str) -> bool {
let special_chars: &[char] = &['', '', '', '—', '―', '', '', '', '“', '”', '…'];
artist_name.chars().any(|ch| special_chars.contains(&ch))
AC.find(artist_name).is_some()
}
// FIXME: use aho_corasick for normalization - AhoCorasick does not implement PartialEq. It
// makes more sense for it to be placed in app.rs as it would make ArtistSelection non-trivial.
fn normalize_search(search: &str, lowercase: bool, asciify: bool) -> String {
let normalized = if lowercase {
search.to_lowercase()
@ -173,17 +183,8 @@ impl<MH: IMusicHoard> IAppInteractSearchPrivate for AppMachine<MH, AppSearch> {
search.to_owned()
};
// Unlikely that this covers all possible strings, but it should at least cover strings
// relevant for music (at least in English). The list of characters handled is based on
// https://wiki.musicbrainz.org/User:Yurim/Punctuation_and_Special_Characters.
if asciify {
normalized
// U+2010 hyphen, U+2012 figure dash, U+2013 en dash, U+2014 em dash,
// U+2015 horizontal bar, U+2212 minus sign
.replace(['‐', '‒', '–', '—', '―', '−'], "-")
.replace(['‘', '’'], "'") // U+2018, U+2019
.replace(['“', '”'], "\"") // U+201C, U+201D
.replace('…', "...") // U+2026
AC.replace_all(&normalized, &REPLACE)
} else {
normalized
}