Mirror of https://github.com/karakeep-app/karakeep.git
Add a new search qualifier `is:broken` that allows users to filter bookmarks with broken or failed links. This matches the functionality on the broken links settings page, where a link is considered broken if:

- crawlStatus is "failure"
- crawlStatusCode is less than 200
- crawlStatusCode is greater than 299

The qualifier supports negation with `-is:broken` to find working links.

Changes:

- Add brokenLinks matcher type definition
- Update search query parser to handle is:broken qualifier
- Implement query execution logic for broken links filtering
- Add autocomplete support with translations
- Add parser tests
- Update search query language documentation

Co-authored-by: Claude <noreply@anthropic.com>
427 lines · 11 KiB · TypeScript
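For illustration, here is a minimal sketch of how the new qualifier flows through the parser below, plus a hypothetical predicate mirroring the broken-links rule described above. The `isLinkBroken` helper, its parameter types, and the import path are assumptions for the example, not part of this change:

```ts
import { parseSearchQuery } from "./searchQueryParser"; // hypothetical path

// `is:broken` parses into a brokenLinks matcher:
const q = parseSearchQuery("is:broken");
// => { text: "", matcher: { type: "brokenLinks", brokenLinks: true }, result: "full" }

// Negation flips the flag, matching working links instead:
const nq = parseSearchQuery("-is:broken");
// nq.matcher => { type: "brokenLinks", brokenLinks: false }

// Hypothetical predicate: a link is broken if the crawl failed outright
// or returned a status code outside the 2xx range.
function isLinkBroken(
  crawlStatus: string,
  crawlStatusCode: number | null,
): boolean {
  return (
    crawlStatus === "failure" ||
    (crawlStatusCode !== null &&
      (crawlStatusCode < 200 || crawlStatusCode > 299))
  );
}
```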
import {
  alt,
  alt_sc,
  apply,
  kleft,
  kmid,
  kright,
  lrec_sc,
  opt,
  rule,
  seq,
  str,
  tok,
  Token,
  TokenPosition,
} from "typescript-parsec";
import { z } from "zod";

import { BookmarkTypes } from "./types/bookmarks";
import { Matcher } from "./types/search";
import { parseRelativeDate } from "./utils/relativeDateUtils";

enum TokenType {
  And = "AND",
  Or = "OR",

  Qualifier = "QUALIFIER",
  Ident = "IDENT",
  StringLiteral = "STRING_LITERAL",

  LParen = "LPAREN",
  RParen = "RPAREN",
  Space = "SPACE",
  Hash = "HASH",
  Minus = "MINUS",
}

// Rules are in order of priority
const lexerRules: [RegExp, TokenType][] = [
  [/^\s+and/i, TokenType.And],
  [/^\s+or/i, TokenType.Or],

  [/^#/, TokenType.Hash],
  [/^(is|url|list|after|before|age|feed|title):/, TokenType.Qualifier],

  [/^"([^"]+)"/, TokenType.StringLiteral],

  [/^\(/, TokenType.LParen],
  [/^\)/, TokenType.RParen],
  [/^\s+/, TokenType.Space],
  [/^-/, TokenType.Minus],

  // This needs to be last as it matches a lot of stuff
  [/^[^ )(]+/, TokenType.Ident],
] as const;
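
// Example (an illustrative trace derived from the rules above):
// `is:fav #work "foo bar"` tokenizes as
//   QUALIFIER("is:") IDENT("fav") SPACE HASH("#") IDENT("work")
//   SPACE STRING_LITERAL("\"foo bar\"")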

// A lazy token stream over the input: each token holds the remaining input
// and lexes its successor on demand via `next`.
class LexerToken implements Token<TokenType> {
  private constructor(
    private readonly input: string,
    public kind: TokenType,
    public text: string,
    public pos: TokenPosition,
  ) {}

  public static from(input: string): Token<TokenType> | undefined {
    const tok = new LexerToken(
      input,
      /* Doesn't matter */ TokenType.Ident,
      "",
      {
        index: 0,
        rowBegin: 1,
        rowEnd: 1,
        columnBegin: 0,
        columnEnd: 0,
      },
    );
    return tok.next;
  }

  public get next(): Token<TokenType> | undefined {
    if (!this.input.length) {
      return undefined;
    }

    for (const [regex, tokenType] of lexerRules) {
      const matchRes = regex.exec(this.input);
      if (!matchRes) {
        continue;
      }
      const match = matchRes[0];
      return new LexerToken(this.input.slice(match.length), tokenType, match, {
        index: this.pos.index + match.length,
        columnBegin: this.pos.index + 1,
        columnEnd: this.pos.index + 1 + match.length,
        // Our strings are always only one line
        rowBegin: 1,
        rowEnd: 1,
      });
    }
    // No match
    throw new Error(
      `Failed to tokenize the token at position ${this.pos.index}: ${this.input[0]}`,
    );
  }
}

export interface TextAndMatcher {
  text: string;
  matcher?: Matcher;
}

// Mutually recursive grammar rules: MATCHER parses a single term,
// EXP parses a chain of terms joined by whitespace, AND, or OR.
const MATCHER = rule<TokenType, TextAndMatcher>();
const EXP = rule<TokenType, TextAndMatcher>();

MATCHER.setPattern(
  alt_sc(
    apply(
      seq(opt(str("-")), kright(str("is:"), tok(TokenType.Ident))),
      ([minus, ident]) => {
        switch (ident.text) {
          case "fav":
            return {
              text: "",
              matcher: { type: "favourited", favourited: !minus },
            };
          case "archived":
            return {
              text: "",
              matcher: { type: "archived", archived: !minus },
            };
          case "tagged":
            return {
              text: "",
              matcher: { type: "tagged", tagged: !minus },
            };
          case "inlist":
            return {
              text: "",
              matcher: { type: "inlist", inList: !minus },
            };
          case "link":
            return {
              text: "",
              matcher: {
                type: "type",
                typeName: BookmarkTypes.LINK,
                inverse: !!minus,
              },
            };
          case "text":
            return {
              text: "",
              matcher: {
                type: "type",
                typeName: BookmarkTypes.TEXT,
                inverse: !!minus,
              },
            };
          case "media":
            return {
              text: "",
              matcher: {
                type: "type",
                typeName: BookmarkTypes.ASSET,
                inverse: !!minus,
              },
            };
          case "broken":
            return {
              text: "",
              matcher: { type: "brokenLinks", brokenLinks: !minus },
            };
          default:
            // If the token is not known, emit it as pure text
            return {
              text: `${minus?.text ?? ""}is:${ident.text}`,
              matcher: undefined,
            };
        }
      },
    ),
    apply(
      seq(
        opt(str("-")),
        alt(tok(TokenType.Qualifier), tok(TokenType.Hash)),
        alt(
          apply(tok(TokenType.Ident), (tok) => {
            return tok.text;
          }),
          apply(tok(TokenType.StringLiteral), (tok) => {
            return tok.text.slice(1, -1);
          }),
        ),
      ),
      ([minus, qualifier, ident]) => {
        switch (qualifier.text) {
          case "url:":
            return {
              text: "",
              matcher: { type: "url", url: ident, inverse: !!minus },
            };
          case "title:":
            return {
              text: "",
              matcher: { type: "title", title: ident, inverse: !!minus },
            };
          case "#":
            return {
              text: "",
              matcher: { type: "tagName", tagName: ident, inverse: !!minus },
            };
          case "list:":
            return {
              text: "",
              matcher: { type: "listName", listName: ident, inverse: !!minus },
            };
          case "feed:":
            return {
              text: "",
              matcher: {
                type: "rssFeedName",
                feedName: ident,
                inverse: !!minus,
              },
            };
          case "after:":
            try {
              return {
                text: "",
                matcher: {
                  type: "dateAfter",
                  dateAfter: z.coerce.date().parse(ident),
                  inverse: !!minus,
                },
              };
            } catch {
              return {
                // If parsing the date fails, emit it as pure text
                text: (minus?.text ?? "") + qualifier.text + ident,
                matcher: undefined,
              };
            }
          case "before:":
            try {
              return {
                text: "",
                matcher: {
                  type: "dateBefore",
                  dateBefore: z.coerce.date().parse(ident),
                  inverse: !!minus,
                },
              };
            } catch {
              return {
                // If parsing the date fails, emit it as pure text
                text: (minus?.text ?? "") + qualifier.text + ident,
                matcher: undefined,
              };
            }
          case "age:":
            try {
              const { direction, amount, unit } = parseRelativeDate(ident);
              return {
                text: "",
                matcher: {
                  type: "age",
                  relativeDate: { direction, amount, unit },
                },
              };
            } catch {
              return {
                // If parsing the relative time fails, emit it as pure text
                text: (minus?.text ?? "") + qualifier.text + ident,
                matcher: undefined,
              };
            }
          default:
            // If the token is not known, emit it as pure text
            return {
              text: (minus?.text ?? "") + qualifier.text + ident,
              matcher: undefined,
            };
        }
      },
    ),
    // Ident or an incomplete qualifier
    apply(alt(tok(TokenType.Ident), tok(TokenType.Qualifier)), (toks) => {
      return {
        text: toks.text,
        matcher: undefined,
      };
    }),
    kmid(tok(TokenType.LParen), EXP, tok(TokenType.RParen)),
  ),
);
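
// For example (an illustrative trace of the pattern above):
// `-url:example.com` yields { type: "url", url: "example.com", inverse: true },
// while an unrecognized qualifier such as `is:foo` falls through as plain text.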

EXP.setPattern(
  lrec_sc(
    MATCHER,
    seq(
      alt(
        tok(TokenType.Space),
        kleft(tok(TokenType.And), tok(TokenType.Space)),
        kleft(tok(TokenType.Or), tok(TokenType.Space)),
      ),
      MATCHER,
    ),
    (toks, next) => {
      switch (next[0].kind) {
        case TokenType.Space:
        case TokenType.And:
          return {
            text: [toks.text, next[1].text].join(" ").trim(),
            matcher:
              !!toks.matcher || !!next[1].matcher
                ? {
                    type: "and",
                    matchers: [toks.matcher, next[1].matcher].filter(
                      (a) => !!a,
                    ),
                  }
                : undefined,
          };
        case TokenType.Or:
          return {
            text: [toks.text, next[1].text].join(" ").trim(),
            matcher:
              !!toks.matcher || !!next[1].matcher
                ? {
                    type: "or",
                    matchers: [toks.matcher, next[1].matcher].filter(
                      (a) => !!a,
                    ),
                  }
                : undefined,
          };
      }
    },
  ),
);
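
// Note: the lrec_sc chain above folds left-associatively with no precedence
// between the operators, so `a and b or c` groups as `(a and b) or c`;
// parentheses can override this (an observation about the grammar, not
// taken from upstream documentation).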

// Collapse nested "and"/"or" matchers of the same type into a single level,
// e.g. and(and(a, b), c) becomes and(a, b, c).
function flattenAndsAndOrs(matcher: Matcher): Matcher {
  switch (matcher.type) {
    case "and":
    case "or": {
      if (matcher.matchers.length == 1) {
        return flattenAndsAndOrs(matcher.matchers[0]);
      }
      const flattened: Matcher[] = [];
      for (let m of matcher.matchers) {
        // If the nested matcher is of the same type, splice its children in
        m = flattenAndsAndOrs(m);
        if (m.type == matcher.type) {
          flattened.push(...m.matchers);
        } else {
          flattened.push(m);
        }
      }
      matcher.matchers = flattened;
      return matcher;
    }
    default:
      return matcher;
  }
}

export function _parseAndPrintTokens(query: string) {
  console.log(`PARSING: ${query}`);
  let tok = LexerToken.from(query);
  do {
    console.log(tok?.kind, tok?.text);
    tok = tok?.next;
  } while (tok);
  console.log("DONE");
}

// Stitch the remaining (unparsed) token stream back into a plain string.
function consumeTokenStream(token: Token<TokenType>) {
  let str = "";
  let tok: Token<TokenType> | undefined = token;
  do {
    str += tok.text;
    tok = tok.next;
  } while (tok);
  return str;
}

export function parseSearchQuery(
  query: string,
): TextAndMatcher & { result: "full" | "partial" | "invalid" } {
  // _parseAndPrintTokens(query); // Uncomment to debug tokenization
  const parsed = EXP.parse(LexerToken.from(query.trim()));
  if (!parsed.successful || parsed.candidates.length != 1) {
    // If the query is not valid, return the whole query as pure text
    return {
      text: query,
      result: "invalid",
    };
  }

  const parseCandidate = parsed.candidates[0];
  if (parseCandidate.result.matcher) {
    parseCandidate.result.matcher = flattenAndsAndOrs(
      parseCandidate.result.matcher,
    );
  }
  if (parseCandidate.nextToken) {
    // The parser failed to consume the whole query. This usually happens
    // when the user is still typing the query. Return the partial result
    // and the remaining query as pure text.
    return {
      text: (
        parseCandidate.result.text +
        consumeTokenStream(parseCandidate.nextToken)
      ).trim(),
      matcher: parseCandidate.result.matcher,
      result: "partial",
    };
  }

  return {
    text: parseCandidate.result.text,
    matcher: parseCandidate.result.matcher,
    result: "full",
  };
}
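
// Usage sketch (expected values derived by hand from the grammar above,
// not from running code):
//   parseSearchQuery("is:fav #work")
//     => { text: "", matcher: { type: "and", matchers: [
//          { type: "favourited", favourited: true },
//          { type: "tagName", tagName: "work", inverse: false },
//        ] }, result: "full" }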