Skip to content

Commit d222cba

Browse files
committed
Add new regex_capture scalar functions and regex_captures table function
1 parent fa0e6d2 commit d222cba

File tree

7 files changed

+587
-2
lines changed

7 files changed

+587
-2
lines changed

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,28 @@ from regex_find_all(
4242
*/
4343
```
4444

45+
**Extract capture group values by index or name**
46+
47+
```sql
48+
select
49+
regex_capture(captures, 0) as entire_match,
50+
regex_capture(captures, 'title') as title,
51+
regex_capture(captures, 'year') as year
52+
from regex_captures(
53+
regex("'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"),
54+
"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."
55+
);
56+
/*
57+
┌───────────────────────────┬──────────────────┬──────┐
58+
│ entire_match │ title │ year │
59+
├───────────────────────────┼──────────────────┼──────┤
60+
│ 'Citizen Kane' (1941) │ Citizen Kane │ 1941 │
61+
│ 'The Wizard of Oz' (1939) │ The Wizard of Oz │ 1939 │
62+
│ 'M' (1931) │ M │ 1931 │
63+
└───────────────────────────┴──────────────────┴──────┘
64+
*/
65+
```
66+
4567
**Use RegexSets to match a string on multiple patterns in linear time**
4668

4769
```sql

docs.md

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,147 @@ from regex_find_all(
103103
└───────┴───────┴─────┴───────────────┘
104104
```
105105
106+
<h3 name="regex_capture"><code>regex_capture(pattern, text, group)</code></h3>
107+
108+
Returns the text of the capture group with the specified `group` index or name, or NULL otherwise. Errors if `pattern` is not a legal regex. Based on [`Regex.captures()`](https://docs.rs/regex/latest/regex/struct.Regex.html#method.captures).
109+
110+
If `group` is a number, then the N-th capture group is returned, where `0` refers to the entire match, `1` refers to the first left-most capture group in the match, `2` the second, and so on. If the provided group number "overflows" (that is, it is larger than the number of capture groups in the pattern), then NULL is returned.
111+
112+
```sql
113+
select regex_capture(
114+
"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)",
115+
"Not my favorite movie: 'Citizen Kane' (1941).",
116+
0
117+
);
118+
-- "'Citizen Kane' (1941)"
119+
120+
select regex_capture(
121+
"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)",
122+
"Not my favorite movie: 'Citizen Kane' (1941).",
123+
1
124+
);
125+
-- "Citizen Kane"
126+
127+
128+
select regex_capture(
129+
"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)",
130+
"Not my favorite movie: 'Citizen Kane' (1941).",
131+
2
132+
);
133+
-- "1941"
134+
135+
select regex_capture(
136+
"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)",
137+
"Not my favorite movie: 'Citizen Kane' (1941).",
138+
3
139+
);
140+
-- NULL
141+
```
142+
143+
If `group` is a string, then the value of the capture group with the same name is returned. If there is no matching capture group with the name, or the group was not captured, then NULL is returned.
144+
145+
```sql
146+
select regex_capture(
147+
"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)",
148+
"Not my favorite movie: 'Citizen Kane' (1941).",
149+
'title'
150+
);
151+
-- "Citizen Kane"
152+
153+
154+
select regex_capture(
155+
"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)",
156+
"Not my favorite movie: 'Citizen Kane' (1941).",
157+
'year'
158+
);
159+
-- "1941"
160+
161+
select regex_capture(
162+
"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)",
163+
"Not my favorite movie: 'Citizen Kane' (1941).",
164+
'not_exist'
165+
);
166+
-- NULL
167+
```
168+
169+
Note that there is a version of `regex_capture()` that only has two parameters: `captures` and `group`. This can only be used with the [`regex_captures`](#regex_captures) table function, with the special `captures` column, like so:
170+
171+
```sql
172+
select
173+
regex_capture(captures, 'title') as title,
174+
regex_capture(captures, 'year') as year,
175+
regex_capture(captures, 'not_exist') as not_exist
176+
from regex_captures(
177+
regex("'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"),
178+
"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."
179+
);
180+
/*
181+
┌──────────────────┬──────┬───────────┐
182+
│ title │ year │ not_exist │
183+
├──────────────────┼──────┼───────────┤
184+
│ Citizen Kane │ 1941 │ │
185+
│ The Wizard of Oz │ 1939 │ │
186+
│ M │ 1931 │ │
187+
└──────────────────┴──────┴───────────┘
188+
*/
189+
```
190+
191+
<h3 name="regex_captures"><code>select * from regex_captures(pattern, text)</code></h3>
192+
193+
Returns all non-overlapping capture groups in the given text. Similar to [`regex_find_all`](#regex_find_all), but allows for extracting capture information. Must be used with the [`regex_capture`](#regex_capture) function to extract capture group values. Based on [`Regex.captures_iter()`](https://docs.rs/regex/latest/regex/struct.Regex.html#method.captures_iter).
194+
195+
The returned columns:
196+
197+
- `rowid`: The 0-based index of the match within the text. `0` is the first match, `1` the second match, `2` the third, etc.
198+
- `captures`: A special value that's meant to be passed into [`regex_capture()`](#regex_capture). Will appear NULL through direct access.
199+
200+
For faster results, wrap the pattern with the [`regex()`](#regex) function for caching.
201+
202+
```sql
203+
select
204+
rowid,
205+
captures,
206+
regex_capture(captures, 0) as "0",
207+
regex_capture(captures, 1) as "1",
208+
regex_capture(captures, 2) as "2",
209+
regex_capture(captures, 3) as "3"
210+
from regex_captures(
211+
regex("'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"),
212+
"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."
213+
);
214+
/*
215+
┌───────┬──────────┬───────────────────────────┬──────────────────┬──────┬───┐
216+
│ rowid │ captures │ 0 │ 1 │ 2 │ 3 │
217+
├───────┼──────────┼───────────────────────────┼──────────────────┼──────┼───┤
218+
│ 0 │ │ 'Citizen Kane' (1941) │ Citizen Kane │ 1941 │ │
219+
│ 1 │ │ 'The Wizard of Oz' (1939) │ The Wizard of Oz │ 1939 │ │
220+
│ 2 │ │ 'M' (1931) │ M │ 1931 │ │
221+
└───────┴──────────┴───────────────────────────┴──────────────────┴──────┴───┘
222+
*/
223+
```
224+
225+
```sql
226+
select
227+
rowid,
228+
captures,
229+
regex_capture(captures, 'title') as title,
230+
regex_capture(captures, 'year') as year,
231+
regex_capture(captures, 'blah') as blah
232+
from regex_captures(
233+
regex("'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"),
234+
"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."
235+
);
236+
/*
237+
┌───────┬──────────┬──────────────────┬──────┬──────┐
238+
│ rowid │ captures │ title │ year │ blah │
239+
├───────┼──────────┼──────────────────┼──────┼──────┤
240+
│ 0 │ │ Citizen Kane │ 1941 │ │
241+
│ 1 │ │ The Wizard of Oz │ 1939 │ │
242+
│ 2 │ │ M │ 1931 │ │
243+
└───────┴──────────┴──────────────────┴──────┴──────┘
244+
*/
245+
```
246+
106247
<h3 name="regex_replace"><code>regex_replace(pattern, text, replacement)</code></h3>
107248

108249
Replace the **first** instance of `pattern` inside `text` with the given `replacement` text. Supports the [replacement string syntax](https://docs.rs/regex/latest/regex/struct.Regex.html#replacement-string-syntax). Based on [`Regex.replace()`](https://docs.rs/regex/latest/regex/struct.Regex.html#method.replace).

src/captures.rs

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
use regex::{Captures, Regex};
2+
use sqlite_loadable::{
3+
api,
4+
table::{ConstraintOperator, IndexInfo, VTab, VTabArguments, VTabCursor},
5+
BestIndexError, Result,
6+
};
7+
use sqlite_loadable::{prelude::*, Error};
8+
9+
use std::{mem, os::raw::c_int};
10+
11+
use crate::utils::{result_regex_captures, value_regex};
12+
13+
static CREATE_SQL: &str = "CREATE TABLE x(captures, pattern hidden, contents text hidden)";
14+
enum Columns {
15+
Captures,
16+
Pattern,
17+
Contents,
18+
}
19+
fn column(index: i32) -> Option<Columns> {
20+
match index {
21+
0 => Some(Columns::Captures),
22+
1 => Some(Columns::Pattern),
23+
2 => Some(Columns::Contents),
24+
_ => None,
25+
}
26+
}
27+
28+
#[repr(C)]
29+
pub struct RegexCapturesTable {
30+
/// must be first
31+
base: sqlite3_vtab,
32+
}
33+
34+
impl<'vtab> VTab<'vtab> for RegexCapturesTable {
35+
type Aux = ();
36+
type Cursor = RegexCapturesCursor<'vtab>;
37+
38+
fn connect(
39+
_db: *mut sqlite3,
40+
_aux: Option<&Self::Aux>,
41+
_args: VTabArguments,
42+
) -> Result<(String, RegexCapturesTable)> {
43+
let base: sqlite3_vtab = unsafe { mem::zeroed() };
44+
let vtab = RegexCapturesTable { base };
45+
// TODO db.config(VTabConfig::Innocuous)?;
46+
Ok((CREATE_SQL.to_owned(), vtab))
47+
}
48+
fn destroy(&self) -> Result<()> {
49+
Ok(())
50+
}
51+
52+
fn best_index(&self, mut info: IndexInfo) -> core::result::Result<(), BestIndexError> {
53+
let mut has_pattern = false;
54+
let mut has_contents = false;
55+
for mut constraint in info.constraints() {
56+
match column(constraint.column_idx()) {
57+
Some(Columns::Pattern) => {
58+
if constraint.usable() && constraint.op() == Some(ConstraintOperator::EQ) {
59+
constraint.set_omit(true);
60+
constraint.set_argv_index(1);
61+
has_pattern = true;
62+
} else {
63+
return Err(BestIndexError::Constraint);
64+
}
65+
}
66+
Some(Columns::Contents) => {
67+
if constraint.usable() && constraint.op() == Some(ConstraintOperator::EQ) {
68+
constraint.set_omit(true);
69+
constraint.set_argv_index(2);
70+
has_contents = true;
71+
} else {
72+
return Err(BestIndexError::Constraint);
73+
}
74+
}
75+
_ => (),
76+
}
77+
}
78+
if !has_pattern || !has_contents {
79+
return Err(BestIndexError::Error);
80+
}
81+
info.set_estimated_cost(100000.0);
82+
info.set_estimated_rows(100000);
83+
info.set_idxnum(2);
84+
85+
Ok(())
86+
}
87+
88+
fn open(&mut self) -> Result<RegexCapturesCursor<'_>> {
89+
Ok(RegexCapturesCursor::new())
90+
}
91+
}
92+
93+
#[repr(C)]
94+
pub struct RegexCapturesCursor<'vtab> {
95+
/// Base class. Must be first
96+
base: sqlite3_vtab_cursor,
97+
r_clone: Option<Regex>,
98+
all_captures: Option<Vec<Captures<'vtab>>>,
99+
curr: usize,
100+
}
101+
impl RegexCapturesCursor<'_> {
102+
fn new<'vtab>() -> RegexCapturesCursor<'vtab> {
103+
let base: sqlite3_vtab_cursor = unsafe { mem::zeroed() };
104+
RegexCapturesCursor {
105+
base,
106+
r_clone: None,
107+
all_captures: None,
108+
curr: 0,
109+
}
110+
}
111+
}
112+
113+
impl VTabCursor for RegexCapturesCursor<'_> {
114+
fn filter(
115+
&mut self,
116+
_idx_num: c_int,
117+
_idx_str: Option<&str>,
118+
values: &[*mut sqlite3_value],
119+
) -> Result<()> {
120+
let r = value_regex(
121+
values
122+
.get(0)
123+
.ok_or_else(|| Error::new_message("expected 1st argument as regex"))?,
124+
)?;
125+
let contents = api::value_text_notnull(
126+
values
127+
.get(1)
128+
.ok_or_else(|| Error::new_message("expected 2nd argument as contents"))?,
129+
)?;
130+
131+
let mut res = vec![];
132+
for captures in r.captures_iter(contents) {
133+
res.push(captures)
134+
}
135+
self.r_clone = Some((*r).clone());
136+
Box::into_raw(r);
137+
self.all_captures = Some(res);
138+
self.curr = 0;
139+
Ok(())
140+
}
141+
142+
fn next(&mut self) -> Result<()> {
143+
self.curr += 1;
144+
Ok(())
145+
}
146+
147+
fn eof(&self) -> bool {
148+
self.all_captures
149+
.as_ref()
150+
.map_or(true, |m| self.curr >= m.len())
151+
}
152+
153+
fn column(&self, context: *mut sqlite3_context, i: c_int) -> Result<()> {
154+
let captures = self
155+
.all_captures
156+
.as_ref()
157+
.ok_or_else(|| {
158+
Error::new_message("sqlite-regex internal error: self.all_captures is not defined")
159+
})?
160+
.get(self.curr)
161+
.ok_or_else(|| {
162+
Error::new_message(
163+
"sqlite-regex internal error: self.curr greater than all_captures result",
164+
)
165+
})?;
166+
match column(i) {
167+
Some(Columns::Captures) => {
168+
result_regex_captures(context, self.r_clone.as_ref().unwrap(), captures);
169+
}
170+
Some(Columns::Pattern) => (),
171+
Some(Columns::Contents) => (),
172+
None => (),
173+
}
174+
Ok(())
175+
}
176+
177+
fn rowid(&self) -> Result<i64> {
178+
Ok(self.curr as i64)
179+
}
180+
}

src/lib.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
mod captures;
12
mod find_all;
23
mod meta;
34
mod regex;
@@ -12,7 +13,10 @@ use sqlite_loadable::{
1213
define_scalar_function, define_table_function, errors::Result, FunctionFlags,
1314
};
1415

15-
use crate::{find_all::RegexFindAllTable, meta::*, regex::*, regexset::*, split::RegexSplitTable};
16+
use crate::{
17+
captures::RegexCapturesTable, find_all::RegexFindAllTable, meta::*, regex::*, regexset::*,
18+
split::RegexSplitTable,
19+
};
1620

1721
#[sqlite_entrypoint]
1822
pub fn sqlite3_regex_init(db: *mut sqlite3) -> Result<()> {
@@ -34,8 +38,12 @@ pub fn sqlite3_regex_init(db: *mut sqlite3) -> Result<()> {
3438
define_scalar_function(db, "regex_replace", 3, regex_replace, flags)?;
3539
define_scalar_function(db, "regex_replace_all", 3, regex_replace_all, flags)?;
3640

41+
define_scalar_function(db, "regex_capture", 3, regex_capture, flags)?;
42+
define_scalar_function(db, "regex_capture", 2, regex_capture2, flags)?;
43+
3744
define_table_function::<RegexFindAllTable>(db, "regex_find_all", None)?;
3845
define_table_function::<RegexSplitTable>(db, "regex_split", None)?;
46+
define_table_function::<RegexCapturesTable>(db, "regex_captures", None)?;
3947

4048
define_scalar_function(db, "regexset", -1, regexset, flags)?;
4149
define_scalar_function(db, "regexset_print", 1, regexset_print, flags)?;

0 commit comments

Comments
 (0)