/**
 * Category tags a token can carry in `Token.type`.
 *
 * "space" and "comment" are recognised by the matchers but are not included
 * in the list returned by `tokenize`.
 */
export type TokenTypes =
  | "space"
  | "comment"
  | "string"
  | "keyword"
  | "at"
  | "colon"
  | "semicolon"
  | "comma"
  | "equals"
  | "curly_open"
  | "curly_close"
  | "bracket_open"
  | "bracket_close"
  | "array"
  | "questionmark"
  | "number"
  | "text";
/** A single token produced by a matcher. */
export type Token = {
  /** Category of the token. */
  type: TokenTypes;
  /** The exact slice of the input that was matched. */
  value: string;
  /** Offset in the input at which `value` starts (inclusive). */
  startIdx: number;
  /** Offset just past the last character of `value` (exclusive). */
  endIdx: number;
};
/**
 * A token recogniser: receives the complete input plus the offset to look at
 * and yields the token found at that offset, or `undefined` when it does not
 * recognise anything there.
 */
type Matcher = (input: string, index: number) => Token | undefined;
/**
 * Error raised when the input cannot be tokenized.
 *
 * Carries the offset in the input at which the failure was detected so that
 * callers can report a position.
 */
export class TokenizerError extends Error {
  /** Offset in the input at which tokenizing failed. */
  index: number;

  /**
   * @param message Human-readable description of the failure.
   * @param index Offset in the input at which the failure occurred.
   */
  constructor(message: string, index: number) {
    super(message);
    // When compiled down to ES5, `super()` on a built-in returns a plain
    // Error; restoring the prototype keeps `instanceof TokenizerError` true.
    Object.setPrototypeOf(this, new.target.prototype);
    // Without this the error is reported as a generic "Error".
    this.name = "TokenizerError";
    this.index = index;
  }
}
/**
 * Builds a matcher that recognises tokens of `type` using a regular
 * expression.
 *
 * The expression is applied to the remainder of the input starting at the
 * requested offset, so patterns are expected to begin with `^`; an unanchored
 * pattern may match further along while the token is still reported as
 * starting at the requested offset.
 *
 * @param regex Pattern to match. A string is compiled with `new RegExp`
 *   (no flags).
 * @param type Token type assigned to every token this matcher produces.
 * @returns A matcher yielding the matched token, or `undefined` when the
 *   pattern does not match or matches the empty string.
 */
function regexMatcher(regex: string | RegExp, type: TokenTypes): Matcher {
  const pattern = typeof regex === "string" ? new RegExp(regex) : regex;

  return (input: string, index: number): Token | undefined => {
    const value = input.substring(index).match(pattern)?.[0];
    // An empty match consumes nothing; reporting it as a token would stop a
    // caller that advances by the token length from ever making progress.
    if (!value) return undefined;

    return {
      type,
      value,
      startIdx: index,
      endIdx: index + value.length,
    };
  };
}
/**
 * Token matchers in priority order.
 *
 * `tokenize` uses the first entry that matches at the current offset, so the
 * order matters: for example "keyword" must come before the generic "text"
 * matcher, which would otherwise accept the same words.
 */
const matcher = [
  regexMatcher(/^\s+/, "space"),
  // Block comment. `[\s\S]` matches any character including newlines with a
  // single alternative; the former `(.|\s)` had two overlapping ways to match
  // whitespace and backtracked exponentially on an unterminated comment.
  regexMatcher(/^\/\*[\s\S]*?\*\//, "comment"),
  // Line comments; `.*` so that a bare `//` or `#` with nothing after it is
  // still accepted as a comment.
  regexMatcher(/^\/\/.*/, "comment"),
  regexMatcher(/^#.*/, "comment"),
  // Double-quoted string on a single line; the quotes are part of the value.
  regexMatcher(/^".*?"/, "string"),
  // regexMatcher(/(?<=^")(.*?)(?=")/, "string"),
  regexMatcher(/^(type|enum|import|service|define)\b/, "keyword"),
  regexMatcher(/^@/, "at"),
  regexMatcher(/^:/, "colon"),
  regexMatcher(/^;/, "semicolon"),
  regexMatcher(/^,/, "comma"),
  regexMatcher(/^=/, "equals"),
  regexMatcher(/^{/, "curly_open"),
  regexMatcher(/^}/, "curly_close"),
  regexMatcher(/^\(/, "bracket_open"),
  regexMatcher(/^\)/, "bracket_close"),
  regexMatcher(/^\[\]/, "array"),
  regexMatcher(/^\?/, "questionmark"),
  // NOTE(review): accepts any run of digits and dots (e.g. "1.2.3" or "."),
  // left unchanged because later stages may rely on it; confirm.
  regexMatcher(/^[.0-9]+/, "number"),
  regexMatcher(/^[a-zA-Z_][a-zA-Z0-9_]*/, "text"),
];
/**
 * Splits `input` into tokens.
 *
 * At each offset the matchers in `matcher` are tried in order and the first
 * one that matches wins. Tokens of type "space" and "comment" are consumed
 * but left out of the result.
 *
 * @param input Text to tokenize.
 * @returns The tokens in input order, without whitespace and comments.
 * @throws TokenizerError when no matcher recognises the text at some offset;
 *   the error's `index` is that offset.
 */
export default function tokenize(input: string): Token[] {
  const tokens: Token[] = [];
  let index = 0;

  while (index < input.length) {
    let match: Token | undefined;
    // Stop at the first matcher that recognises something instead of running
    // every matcher at every offset.
    for (const tryMatch of matcher) {
      const candidate = tryMatch(input, index);
      // A zero-length match would leave `index` unchanged and loop forever,
      // so it is treated as "no match".
      if (candidate && candidate.value.length > 0) {
        match = candidate;
        break;
      }
    }

    if (!match) {
      throw new TokenizerError(
        `Unexpected token '${input.substring(index, index + 1)}'`,
        index
      );
    }

    if (match.type !== "space" && match.type !== "comment") {
      tokens.push(match);
    }
    index += match.value.length;
  }

  return tokens;
}