Hi, yet another question regarding custom languages (potentially more of a Grammar-Kit / JFlex question than a platform one; feel free to close it if it doesn’t fit).
I have a working grammar and lexer for my language, which work so far. For example:
TextField {
width: 100%;
margin: 2 40 5 5;
color: mix-color(1, 2, 3);
}
resulting in
my PSI tree
My Custom Stylesheet(0,77)
LssDeclarationBlockImpl(DECLARATION_BLOCK)(0,77)
LssSelectorGroupImpl(SELECTOR_GROUP)(0,9)
LssSelectorImpl(SELECTOR)(0,9)
LssWidgetSelectorImpl(WIDGET_SELECTOR)(0,9)
LssIdentifierImpl(IDENTIFIER)(0,9)
PsiElement(LssTokenType.IDENTIFIER)('TextField')(0,9)
PsiWhiteSpace(' ')(9,10)
PsiElement(LssTokenType.{)('{')(10,11)
PsiWhiteSpace('\n ')(11,14)
LssDeclarationImpl(DECLARATION)(14,26)
LssPropertyImpl(PROPERTY)(14,25)
LssIdentifierImpl(IDENTIFIER)(14,19)
PsiElement(LssTokenType.IDENTIFIER)('width')(14,19)
PsiElement(LssTokenType.:)(':')(19,20)
PsiWhiteSpace(' ')(20,21)
LssValueListImpl(VALUE_LIST)(21,25)
LssValueImpl(VALUE)(21,25)
LssNumberImpl(NUMBER)(21,25)
PsiElement(LssTokenType.NUMBER)('100')(21,24)
PsiElement(LssTokenType.%)('%')(24,25)
PsiElement(LssTokenType.;)(';')(25,26)
PsiWhiteSpace('\n ')(26,29)
LssDeclarationImpl(DECLARATION)(29,46)
LssPropertyImpl(PROPERTY)(29,45)
LssIdentifierImpl(IDENTIFIER)(29,35)
PsiElement(LssTokenType.IDENTIFIER)('margin')(29,35)
PsiElement(LssTokenType.:)(':')(35,36)
PsiWhiteSpace(' ')(36,37)
LssValueListImpl(VALUE_LIST)(37,45)
LssValueImpl(VALUE)(37,38)
LssNumberImpl(NUMBER)(37,38)
PsiElement(LssTokenType.NUMBER)('2')(37,38)
PsiElement(LssTokenType.WHITE_SPACE)(' ')(38,39)
LssValueImpl(VALUE)(39,41)
LssNumberImpl(NUMBER)(39,41)
PsiElement(LssTokenType.NUMBER)('40')(39,41)
PsiElement(LssTokenType.WHITE_SPACE)(' ')(41,42)
LssValueImpl(VALUE)(42,43)
LssNumberImpl(NUMBER)(42,43)
PsiElement(LssTokenType.NUMBER)('5')(42,43)
PsiElement(LssTokenType.WHITE_SPACE)(' ')(43,44)
LssValueImpl(VALUE)(44,45)
LssNumberImpl(NUMBER)(44,45)
PsiElement(LssTokenType.NUMBER)('5')(44,45)
PsiElement(LssTokenType.;)(';')(45,46)
PsiWhiteSpace('\n ')(46,49)
LssDeclarationImpl(DECLARATION)(49,75)
LssPropertyImpl(PROPERTY)(49,74)
LssIdentifierImpl(IDENTIFIER)(49,54)
PsiElement(LssTokenType.IDENTIFIER)('color')(49,54)
PsiElement(LssTokenType.:)(':')(54,55)
PsiWhiteSpace(' ')(55,56)
LssValueListImpl(VALUE_LIST)(56,74)
LssValueImpl(VALUE)(56,74)
LssFunctionImpl(FUNCTION)(56,74)
PsiElement(LssTokenType.IDENTIFIER)('mix-color')(56,65)
PsiElement(LssTokenType.()('(')(65,66)
LssValueImpl(VALUE)(66,67)
LssNumberImpl(NUMBER)(66,67)
PsiElement(LssTokenType.NUMBER)('1')(66,67)
PsiElement(LssTokenType.,)(',')(67,68)
PsiWhiteSpace(' ')(68,69)
LssValueImpl(VALUE)(69,70)
LssNumberImpl(NUMBER)(69,70)
PsiElement(LssTokenType.NUMBER)('2')(69,70)
PsiElement(LssTokenType.,)(',')(70,71)
PsiWhiteSpace(' ')(71,72)
LssValueImpl(VALUE)(72,73)
LssNumberImpl(NUMBER)(72,73)
PsiElement(LssTokenType.NUMBER)('3')(72,73)
PsiElement(LssTokenType.))(')')(73,74)
PsiElement(LssTokenType.;)(';')(74,75)
PsiElement(LssTokenType.})('\n}')(75,77)
The issue I’m having is that the language is extremely flexible in terms of what is allowed as a value. It generally allows everything, and the value is post-processed afterwards (basically, everything is parsed as a string, trimmed, split at whitespace and processed). Most of the built-in stuff I can more or less represent in my grammar, but it’s possible that other value types are added by third-party plugins at runtime, which I can’t control (maybe something like `other-rule: 12 * 4 - 3`, which wouldn’t be supported by default but could be added).
My goal was to add a fallback: if no value can be parsed, whatever is there is parsed as a RAW token type (just so it actually passes the parsing). This would be annotated with a warning and could be suppressed / added as a custom value type. But I wasn’t really able to add a fallback kind of thing to my lexer. Would this rather be solved with a custom parsing rule in a ParserUtil? If so, is there any good example? Or are there better ways? Should I handle the parsing of all value types directly in a custom parser rule?
My current grammar and lexer:
Grammar
/* Root rule: a file is a sequence of zero or more declaration blocks. */
lssFile ::= declaration_block*
/* Partly values, but used in other places as well */
identifier ::= IDENT
/* A quoted string; the content token is optional, so an empty string ("") is accepted. */
string ::= QUOTE STRING_CONTENT? QUOTE
/* Block Selector Rules */
widget_selector ::= identifier
/* An id selector is a period followed by an identifier. */
id_selector ::= PERIOD identifier
/* One or more selectors separated by commas. */
selector_group ::= selector (COMMA selector)*
selector ::= widget_selector | id_selector
/* Term Values */
/* One or more values separated by the language's own WHITE_SPACE token. */
value_list ::= value (WHITE_SPACE value)*
/* Alternatives are tried in the order listed. */
value ::= string | number | function | identifier /* fallback? */
/* A function call: a name followed by a possibly empty, comma-separated argument list in parentheses. */
function ::= IDENT LPAREN (value (COMMA value)*)? RPAREN
/* A number with an optional trailing percent sign. */
number ::= NUM (PERCENTAGE_SYM)?
/* Property Block(s) */
/* A block may contain nested blocks as well as declarations, in any order. */
declaration_block ::= selector_group LCURLY (declaration_block | declaration)* RCURLY
/* A declaration is a property terminated by a semicolon. */
declaration ::= property SEMICOLON
/* A property is a name, a colon, and a list of values. */
property ::= identifier COLON value_list
Lexer
import com.intellij.psi.TokenType;
import com.intellij.psi.tree.IElementType;
import ....LssTypes;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntStack;
%%
// Scanner options: the generated class is LssLexer and implements IntelliJ's FlexLexer,
// Unicode input is enabled, and the scanning method is advance(), returning an IElementType.
%class LssLexer
%implements com.intellij.lexer.FlexLexer
%unicode
%function advance
%type IElementType
%{
/**
 * Creates a lexer without an input source by passing {@code null} to the one-argument constructor.
 * NOTE(review): that constructor is not visible here — presumably the JFlex-generated
 * {@code LssLexer(java.io.Reader)}; confirm.
 */
public LssLexer() {
this(null);
}
// Lexer states saved by pushState() and restored by popState(); the backing list starts with capacity 1000.
private final IntStack stateStack = new IntArrayList(1000);
// NOTE(review): not read or written anywhere in the visible lexer code — confirm it is still needed.
private int parantheses = 0; // current count of non-closed parentheses
private int blocks = 0; // current (nested) block depth
/** Tells whether the lexer is currently inside at least one open block. */
private boolean inBlock() {
    final boolean insideBlock = blocks > 0;
    return insideBlock;
}
/** Records one more level of block nesting, saves the current state and switches to IN_BLOCK_BODY. */
private void enterBlockAndPushState() {
    this.blocks += 1;
    this.pushState(IN_BLOCK_BODY);
}
/** Records one less level of block nesting and leaves IN_BLOCK_BODY if that is the current state. */
private void leaveBlockAndPop() {
    this.blocks -= 1;
    // The conditional overload only pops when the lexer is actually in IN_BLOCK_BODY.
    this.popState(IN_BLOCK_BODY);
}
/**
 * Saves the current lexer state on the state stack and switches to {@code state}.
 *
 * @param state the lexer state to enter
 */
private void pushState(int state) {
    final int current = yystate();
    // The initial state may only ever sit at the bottom of the stack.
    assert current != YYINITIAL || stateStack.isEmpty() : "Attempted to push initial state into non-empty stack";
    stateStack.push(current);
    yybegin(state);
}
/** Restores the most recently saved lexer state from the state stack. */
private void popState() {
    assert !stateStack.isEmpty() : "State stack is empty";
    final int previous = stateStack.popInt();
    yybegin(previous);
}
/**
 * Restores the previously saved lexer state, but only if the lexer is currently in {@code state}.
 *
 * @param state the state the lexer is expected to be in; nothing happens in any other state
 */
private void popState(int state) {
    if (yystate() != state) {
        return;
    }
    popState();
}
/**
 * Pushes back every trailing occurrence of {@code ch} at the end of the current match,
 * so that those characters are scanned again as part of the next token.
 *
 * <p>If the whole match consists of {@code ch}, the whole match is pushed back.
 *
 * @param ch the character to strip from the end of the current match
 */
private void yypushbackWhileChar(char ch) {
    // Read the length once: yypushback() shrinks the current match.
    final int length = yylength();
    if (length < 1) return;
    int trailing = 0;
    // Valid indices for yycharat() are 0 .. length - 1, so the last character is at length - 1.
    while (trailing < length && yycharat(length - 1 - trailing) == ch) {
        trailing++;
    }
    if (trailing > 0) {
        yypushback(trailing);
    }
}
%}
// Character and token macros used by the lexical rules.
SingleWhiteSpace = " "
Whitespace = [ \n\t\f]+
At = "@"
Colon = ":"
Semicolon = ";"
Comma = ","
Quote = \"
LCurly = "{"
RCurly = "}"
LParen = "("
RParen = ")"
Period = "."
// A run of string characters: any character except a quote or a backslash, or a backslash
// escape (backslash + any non-newline character, or backslash + line terminator).
// The backslash is excluded from the plain-character class so that an escaped backslash
// directly before the closing quote cannot be re-read as "backslash, then escaped quote",
// which would make the longest match run past the end of the string.
StringContent = ([^\"\\] | "\\" [^\n] | ("\\" (\r\n|\r|\n)))+
Number = [0-9]+
// Letters and hyphens, starting and ending with a letter (a single letter is allowed).
Identifier = [a-zA-Z]([a-zA-Z-]*[a-zA-Z])?
// Block comment. The "up to" operator stops at the first "*/"; a greedy [^]* would
// instead run to the last "*/" in the input and swallow everything between two comments.
InlineComment = "/*" ~"*/"
SuperSym = "super()"
PercentageSym = "%"
// Lexical states. All are declared with %state; the rules written without a state list
// at the end of the specification also apply in them.
// IN_STRING: between an opening and a closing quote.
%state IN_STRING
// IN_BLOCK_BODY: after an opening curly brace, until the matching closing brace.
%state IN_BLOCK_BODY
// PREPARE_FOR_VALUE: after a property's colon, before the first character of its value.
%state PREPARE_FOR_VALUE
// IN_VALUE: inside a property value, until the terminating semicolon.
%state IN_VALUE
// IN_FUNCTION: inside the parentheses of a function value.
%state IN_FUNCTION
%%
// Selector tokens, recognised both at the top level and inside a block body (for nested blocks).
<YYINITIAL, IN_BLOCK_BODY> {
{Period} { return LssTypes.PERIOD; }
{Identifier} { return LssTypes.IDENT; }
// An opening brace increases the block depth, saves the current state and enters IN_BLOCK_BODY.
{LCurly} { enterBlockAndPushState(); return LssTypes.LCURLY; }
}
// Tokens that are only recognised inside a block body.
<IN_BLOCK_BODY> {
  // A closing brace ends the current block; without an open block it is reported as a bad character.
  {RCurly} {
    if (inBlock()) {
      leaveBlockAndPop();
      return LssTypes.RCURLY;
    }
    return TokenType.BAD_CHARACTER;
  }
  // The colon after a property name starts the whitespace-skipping state that precedes the value.
  {Colon} {
    pushState(PREPARE_FOR_VALUE);
    return LssTypes.COLON;
  }
}
// Intermediate state entered after a property's colon (`color:`) and before the actual value (`green`).
// Whitespace found here is returned as ordinary platform whitespace, because inside IN_VALUE
// whitespace is a separator token.
<PREPARE_FOR_VALUE> {
  // {Whitespace} already means "one or more"; adding `*` would let this rule match the empty string.
  {Whitespace} { return TokenType.WHITE_SPACE; }
  // First non-whitespace character: give the whole match back (with %unicode a single code point
  // can span two chars) and continue in IN_VALUE; no token is produced for this zero-length step.
  [^] { yypushback(yylength()); popState(); pushState(IN_VALUE); }
}
// Rules that apply directly inside a property value, but not inside function arguments.
<IN_VALUE> {
// The semicolon ends the value: the state on top of the state stack is restored.
{Semicolon} { popState(); return LssTypes.SEMICOLON; }
// Whitespace between values is returned as the language's own WHITE_SPACE token
// (the separator in the grammar's value_list rule), not as TokenType.WHITE_SPACE.
{Whitespace} { return LssTypes.WHITE_SPACE; }
}
// Value tokens shared by plain property values and function arguments.
<IN_VALUE, IN_FUNCTION> {
{Number} { return LssTypes.NUM; }
{PercentageSym} { return LssTypes.PERCENTAGE_SYM; }
{Identifier} { return LssTypes.IDENT; }
// An opening parenthesis saves the current state and enters IN_FUNCTION; this also applies
// when already in IN_FUNCTION, so nested parentheses each push their own state.
{LParen} { pushState(IN_FUNCTION); return LssTypes.LPAREN; }
}
// A closing parenthesis ends the current function-argument state.
// Opening parentheses are handled by the shared <IN_VALUE, IN_FUNCTION> rule, which pushes a
// nested IN_FUNCTION state; a second {LParen} rule here is declared later, so it can never match.
<IN_FUNCTION> {
{RParen} { popState(); return LssTypes.RPAREN; }
}
// Inside a quoted string: emit the content as one token and leave the state at the closing quote.
<IN_STRING> {
{StringContent} { return LssTypes.STRING_CONTENT; }
// Declared before the general {Quote} rule below, so inside a string a quote closes it instead of opening a new one.
{Quote} { popState(); return LssTypes.QUOTE; }
}
// Fallback rules without a state list; they apply in YYINITIAL and in every state declared with %state.
{Comma} { return LssTypes.COMMA; }
{Quote} { pushState(IN_STRING); return LssTypes.QUOTE; }
// A line break must produce a token. An action that returns nothing drops the character from
// the token stream, and its text ends up attached to the following token (a lone "\n" in
// front of "}" then appears as a single '\n}' token).
[\n\r] { return TokenType.WHITE_SPACE; }
{Whitespace} { return TokenType.WHITE_SPACE; }
// Anything no other rule accepts.
[^] { return TokenType.BAD_CHARACTER; }