mirror of
https://github.com/carlrobertoh/ProxyAI.git
synced 2026-05-12 22:31:24 +00:00
You API integration (#203)
* Ability to configure custom service * Add example preset templates, rename module * Custom service client impl * Add YOU API integration * Remove/ignore generated antlr classes * Remove text completion models (deprecated) * Remove unused code, fix settings state sync * Display model name/icon in the tool window * Update chat history UI * Fix model/service sync * Clear plugin state * Fix minor bugs, add settings sync tests * UI changes * Separate model configuration * Add support for overriding the completion path * Update Find Bugs prompt
This commit is contained in:
parent
a860054360
commit
37af74ebdf
125 changed files with 1673 additions and 1537 deletions
|
|
@ -0,0 +1,47 @@
|
|||
package ee.carlrobert.embedding;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
 * Immutable snapshot of a file selected for embedding: its name, path, and eagerly-read content.
 */
public class CheckedFile {

  /**
   * Matches the characters after the final dot of the file name, e.g. {@code "java"} in
   * {@code "Main.java"}. The lookbehind requires an actual dot before the match, so names
   * without an extension (e.g. {@code "README"}) produce no match instead of matching the
   * whole name. Compiled once — compiling a Pattern per call is wasteful.
   */
  private static final Pattern EXTENSION_PATTERN = Pattern.compile("(?<=\\.)[^.]+$");

  private final String fileName;
  private final String filePath;
  private final String fileContent;

  /**
   * Reads the given file eagerly into memory.
   *
   * @param file the file to snapshot; must exist and be readable
   * @throws RuntimeException if the file cannot be read
   */
  public CheckedFile(File file) {
    this.fileName = file.getName();
    this.filePath = file.getPath();
    try {
      // NOTE(review): decodes with the platform default charset — consider pinning UTF-8.
      this.fileContent = new String(Files.readAllBytes(Paths.get(filePath)));
    } catch (IOException e) {
      throw new RuntimeException("Unable to read file: " + filePath, e);
    }
  }

  public String getFileName() {
    return fileName;
  }

  public String getFilePath() {
    return filePath;
  }

  public String getFileContent() {
    return fileContent;
  }

  /**
   * Returns the file extension without the leading dot, or an empty string when the file name
   * contains no dot (previously such names were returned whole, which mis-reported e.g.
   * "README" as having extension "README").
   */
  public String getFileExtension() {
    Matcher matcher = EXTENSION_PATTERN.matcher(fileName);
    return matcher.find() ? matcher.group() : "";
  }
}
|
||||
|
|
@ -0,0 +1,134 @@
|
|||
package ee.carlrobert.embedding;
|
||||
|
||||
import static com.github.jelmerk.knn.util.VectorUtils.normalize;
|
||||
import static java.util.stream.Collectors.toList;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.github.jelmerk.knn.Item;
|
||||
import com.github.jelmerk.knn.SearchResult;
|
||||
import com.intellij.openapi.diagnostic.Logger;
|
||||
import com.intellij.openapi.progress.ProgressIndicator;
|
||||
import ee.carlrobert.llm.client.openai.OpenAIClient;
|
||||
import ee.carlrobert.llm.client.openai.completion.chat.OpenAIChatCompletionModel;
|
||||
import ee.carlrobert.llm.client.openai.completion.chat.request.OpenAIChatCompletionMessage;
|
||||
import ee.carlrobert.llm.client.openai.completion.chat.request.OpenAIChatCompletionRequest;
|
||||
import ee.carlrobert.splitter.SplitterFactory;
|
||||
import ee.carlrobert.vector.VectorStore;
|
||||
import ee.carlrobert.vector.Word;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
|
||||
public class EmbeddingsService {
|
||||
|
||||
private static final Logger LOG = Logger.getInstance(EmbeddingsService.class);
|
||||
|
||||
private final VectorStore vectorStore;
|
||||
private final OpenAIClient openAIClient;
|
||||
|
||||
public EmbeddingsService(OpenAIClient openAIClient, Path pluginBasePath) {
|
||||
this.openAIClient = openAIClient;
|
||||
this.vectorStore = VectorStore.getInstance(pluginBasePath);
|
||||
}
|
||||
|
||||
public List<double[]> getEmbeddings(List<String> chunks) {
|
||||
return openAIClient.getEmbeddings(chunks);
|
||||
}
|
||||
|
||||
public String buildPromptWithContext(String prompt) {
|
||||
try {
|
||||
var inputEmbedding = openAIClient.getEmbedding(getSearchQuery(prompt));
|
||||
var sortedResult = vectorStore.loadIndex()
|
||||
.findNearest(normalize(inputEmbedding), 10)
|
||||
.stream()
|
||||
.map(SearchResult::item)
|
||||
.sorted(Comparator.comparing(Word::getMeta))
|
||||
.collect(toList());
|
||||
|
||||
var context = sortedResult.stream().map(Word::id).collect(Collectors.joining());
|
||||
var fileNames = sortedResult.stream().map(Word::getMeta).collect(Collectors.toSet());
|
||||
|
||||
return getResourceContent("/prompts/prompt-with-context.txt")
|
||||
.replace("{prompt}", prompt)
|
||||
.replace("{context}", new GeneratedContextDetails(context, fileNames).getContext());
|
||||
} catch (IOException e) {
|
||||
LOG.error("Unable to load vector index", e);
|
||||
return prompt;
|
||||
}
|
||||
}
|
||||
|
||||
public List<Item<Object, double[]>> createEmbeddings(List<CheckedFile> checkedFiles, @Nullable ProgressIndicator indicator) {
|
||||
var words = new ArrayList<Item<Object, double[]>>();
|
||||
for (int i = 0; i < checkedFiles.size(); i++) {
|
||||
try {
|
||||
var checkedFile = checkedFiles.get(i);
|
||||
addEmbeddings(checkedFile, words);
|
||||
|
||||
if (indicator != null) {
|
||||
indicator.setFraction((double) i / checkedFiles.size());
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
return words;
|
||||
}
|
||||
|
||||
private String getSearchQuery(String userPrompt) throws JsonProcessingException {
|
||||
var message = new OpenAIChatCompletionMessage("user", getResourceContent("/prompts/text-generator.txt").replace("{prompt}", userPrompt));
|
||||
var request = new OpenAIChatCompletionRequest.Builder(List.of(message))
|
||||
.setModel(OpenAIChatCompletionModel.GPT_4)
|
||||
.setMaxTokens(400)
|
||||
.setTemperature(0.1)
|
||||
.setStream(false)
|
||||
.build();
|
||||
|
||||
return openAIClient.getChatCompletion(request)
|
||||
.getChoices()
|
||||
.get(0)
|
||||
.getMessage()
|
||||
.getContent();
|
||||
}
|
||||
|
||||
private void addEmbeddings(CheckedFile checkedFile, List<Item<Object, double[]>> prevEmbeddings) {
|
||||
var fileExtension = checkedFile.getFileExtension();
|
||||
var codeSplitter = SplitterFactory.getCodeSplitter(fileExtension);
|
||||
if (codeSplitter != null) {
|
||||
var chunks = codeSplitter.split(checkedFile.getFileName(), checkedFile.getFileContent());
|
||||
var embeddings = openAIClient.getEmbeddings(chunks);
|
||||
for (int i = 0; i < chunks.size(); i++) {
|
||||
prevEmbeddings.add(new Word(chunks.get(i), checkedFile.getFileName(), normalize(embeddings.get(i))));
|
||||
}
|
||||
} else {
|
||||
var chunks = splitText(checkedFile.getFileContent(), 400);
|
||||
var embeddings = getEmbeddings(chunks);
|
||||
for (int i = 0; i < chunks.size(); i++) {
|
||||
prevEmbeddings.add(new Word(chunks.get(i), checkedFile.getFileName(), normalize(embeddings.get(i))));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static List<String> splitText(String str, int chunkSize) {
|
||||
int len = str.length();
|
||||
var chunks = new ArrayList<String>();
|
||||
for (int i = 0; i < len; i += chunkSize) {
|
||||
chunks.add(str.substring(i, Math.min(len, i + chunkSize)));
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
|
||||
// TODO: Move to shared module
|
||||
private static String getResourceContent(String name) {
|
||||
try (var stream = Objects.requireNonNull(EmbeddingsService.class.getResourceAsStream(name))) {
|
||||
return new String(stream.readAllBytes(), StandardCharsets.UTF_8);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to read resource", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
package ee.carlrobert.embedding;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
/**
 * Immutable value object pairing generated prompt context with the names of the files it was
 * derived from.
 */
public class GeneratedContextDetails {

  private final String context;
  private final Set<String> fileNames;

  /**
   * @param context the generated context text
   * @param fileNames the names of the files the context was built from; defensively copied
   *     into an unmodifiable set so later mutation of the caller's set (or of the returned
   *     set) cannot corrupt this object
   */
  public GeneratedContextDetails(String context, Set<String> fileNames) {
    this.context = context;
    this.fileNames = Set.copyOf(fileNames);
  }

  public String getContext() {
    return context;
  }

  /** Returns an unmodifiable view of the originating file names. */
  public Set<String> getFileNames() {
    return fileNames;
  }
}
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
package ee.carlrobert.splitter;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.antlr.v4.runtime.CharStreams;
|
||||
import org.antlr.v4.runtime.CodePointCharStream;
|
||||
import org.antlr.v4.runtime.ParserRuleContext;
|
||||
import org.antlr.v4.runtime.misc.Interval;
|
||||
import org.antlr.v4.runtime.tree.ParseTree;
|
||||
import org.antlr.v4.runtime.tree.ParseTreeListener;
|
||||
import org.antlr.v4.runtime.tree.ParseTreeWalker;
|
||||
|
||||
abstract class CodeSplitter implements Splitter {
|
||||
|
||||
protected List<String> chunks = new ArrayList<>();
|
||||
|
||||
protected abstract ParseTree getParseTree(CodePointCharStream charStream);
|
||||
|
||||
protected abstract ParseTreeListener getParseTreeListener();
|
||||
|
||||
protected String parseContext(ParserRuleContext ctx) {
|
||||
return ctx.start.getInputStream().getText(
|
||||
new Interval(ctx.start.getStartIndex(), ctx.stop.getStopIndex()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> split(String fileName, String content) {
|
||||
chunks = new ArrayList<>();
|
||||
ParseTreeWalker.DEFAULT.walk(getParseTreeListener(), getParseTree(CharStreams.fromString(content)));
|
||||
return chunks;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
package ee.carlrobert.splitter;
|
||||
|
||||
import grammar.JavaLexer;
|
||||
import grammar.JavaParser;
|
||||
import grammar.JavaParserBaseListener;
|
||||
import org.antlr.v4.runtime.CodePointCharStream;
|
||||
import org.antlr.v4.runtime.CommonTokenStream;
|
||||
import org.antlr.v4.runtime.tree.ParseTree;
|
||||
import org.antlr.v4.runtime.tree.ParseTreeListener;
|
||||
|
||||
public class JavaCodeSplitter extends CodeSplitter {
|
||||
|
||||
@Override
|
||||
protected ParseTree getParseTree(CodePointCharStream charStream) {
|
||||
return new JavaParser(new CommonTokenStream(new JavaLexer(charStream))).compilationUnit();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ParseTreeListener getParseTreeListener() {
|
||||
return new JavaParserBaseListener() {
|
||||
@Override
|
||||
public void enterConstructorDeclaration(JavaParser.ConstructorDeclarationContext ctx) {
|
||||
chunks.add(parseContext(ctx));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void enterMethodDeclaration(JavaParser.MethodDeclarationContext ctx) {
|
||||
chunks.add(parseContext(ctx));
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
package ee.carlrobert.splitter;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.intellij.openapi.diagnostic.Logger;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.json.JSONObject;
|
||||
|
||||
public class JsonSplitter implements Splitter {
|
||||
|
||||
private static final Logger LOG = Logger.getInstance(JsonSplitter.class);
|
||||
|
||||
@Override
|
||||
public List<String> split(String fileName, String content) {
|
||||
var chunks = new ArrayList<String>();
|
||||
|
||||
try {
|
||||
// TODO: Switch to ObjectMapper
|
||||
for (var entry : new JSONObject(content).toMap().entrySet()) {
|
||||
chunks.add(new ObjectMapper().writeValueAsString(entry));
|
||||
}
|
||||
} catch (JsonProcessingException e) {
|
||||
LOG.error("Something went wrong while chunking the json", e);
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
package ee.carlrobert.splitter;
|
||||
|
||||
import grammar.PythonLexer;
|
||||
import grammar.PythonParser;
|
||||
import grammar.PythonParserBaseListener;
|
||||
import org.antlr.v4.runtime.CodePointCharStream;
|
||||
import org.antlr.v4.runtime.CommonTokenStream;
|
||||
import org.antlr.v4.runtime.tree.ParseTree;
|
||||
import org.antlr.v4.runtime.tree.ParseTreeListener;
|
||||
|
||||
public class PythonCodeSplitter extends CodeSplitter {
|
||||
|
||||
@Override
|
||||
protected ParseTree getParseTree(CodePointCharStream charStream) {
|
||||
return new PythonParser(new CommonTokenStream(new PythonLexer(charStream))).file_input();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ParseTreeListener getParseTreeListener() {
|
||||
return new PythonParserBaseListener() {
|
||||
@Override
|
||||
public void enterClass_or_func_def_stmt(PythonParser.Class_or_func_def_stmtContext ctx) {
|
||||
chunks.add(parseContext(ctx));
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
package ee.carlrobert.splitter;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Splits file content into smaller chunks suitable for embedding.
 */
public interface Splitter {

  /**
   * Splits the given file content into chunks.
   *
   * @param fileName the name of the file being split; some implementations ignore it
   * @param content the full text content of the file
   * @return the list of chunks, possibly empty
   */
  List<String> split(String fileName, String content);
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
package ee.carlrobert.splitter;
|
||||
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
|
||||
public class SplitterFactory {
|
||||
|
||||
public static @Nullable Splitter getCodeSplitter(String fileExtension) {
|
||||
switch (fileExtension) {
|
||||
case "java":
|
||||
return new JavaCodeSplitter();
|
||||
case "py":
|
||||
return new PythonCodeSplitter();
|
||||
case "json":
|
||||
return new JsonSplitter();
|
||||
case "ts":
|
||||
case "tsx":
|
||||
return new TypeScriptCodeSplitter();
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
package ee.carlrobert.splitter;
|
||||
|
||||
import grammar.TypeScriptLexer;
|
||||
import grammar.TypeScriptParser;
|
||||
import grammar.TypeScriptParserBaseListener;
|
||||
import org.antlr.v4.runtime.CodePointCharStream;
|
||||
import org.antlr.v4.runtime.CommonTokenStream;
|
||||
import org.antlr.v4.runtime.tree.ParseTree;
|
||||
import org.antlr.v4.runtime.tree.ParseTreeListener;
|
||||
|
||||
/** Splitter for TypeScript (.ts/.tsx) source backed by the ANTLR TypeScript grammar. */
public class TypeScriptCodeSplitter extends CodeSplitter {

  @Override
  protected ParseTree getParseTree(CodePointCharStream charStream) {
    return new TypeScriptParser(new CommonTokenStream(new TypeScriptLexer(charStream))).program();
  }

  @Override
  protected ParseTreeListener getParseTreeListener() {
    // NOTE(review): this listener overrides no callbacks, so nothing is ever added to
    // `chunks` and split() returns an empty list for every TypeScript file. Confirm whether
    // enter* overrides (e.g. for function/class declarations) were intended here.
    return new TypeScriptParserBaseListener() {};
  }
}
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
package ee.carlrobert.vector;
|
||||
|
||||
import com.github.jelmerk.knn.DistanceFunctions;
|
||||
import com.github.jelmerk.knn.Item;
|
||||
import com.github.jelmerk.knn.hnsw.HnswIndex;
|
||||
import com.intellij.openapi.application.ApplicationManager;
|
||||
import com.intellij.openapi.util.io.FileUtil;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
public class VectorStore {
|
||||
|
||||
private static VectorStore instance;
|
||||
|
||||
private final String storePath;
|
||||
|
||||
private VectorStore(Path pluginPath) {
|
||||
this.storePath = getIndexStorePath(pluginPath.toString());
|
||||
}
|
||||
|
||||
public static VectorStore getInstance(Path pluginPath) {
|
||||
if (instance == null) {
|
||||
instance = new VectorStore(pluginPath);
|
||||
}
|
||||
return instance;
|
||||
}
|
||||
|
||||
public HnswIndex<Object, double[], Word, Object> loadIndex() throws IOException {
|
||||
return loadIndex(storePath);
|
||||
}
|
||||
|
||||
public HnswIndex<Object, double[], Word, Object> loadIndex(String path) throws IOException {
|
||||
return HnswIndex.load(new File(path), this.getClass().getClassLoader());
|
||||
}
|
||||
|
||||
public void save(List<Item<Object, double[]>> words) {
|
||||
var hnswIndex = HnswIndex
|
||||
.newBuilder(words.get(0).vector().length, DistanceFunctions.DOUBLE_COSINE_DISTANCE, words.size())
|
||||
.build();
|
||||
try {
|
||||
hnswIndex.addAll(words);
|
||||
hnswIndex.save(new File(storePath));
|
||||
} catch (IOException | InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isIndexExists() {
|
||||
return FileUtil.exists(storePath);
|
||||
}
|
||||
|
||||
private String getIndexStorePath(String pluginBasePath) {
|
||||
if (ApplicationManager.getApplication().isUnitTestMode()) {
|
||||
pluginBasePath = new File("src/test/resources/indexes").getAbsolutePath();
|
||||
}
|
||||
return pluginBasePath + File.separator + "hnsw.index";
|
||||
}
|
||||
}
|
||||
46
codegpt-core/src/main/java/ee/carlrobert/vector/Word.java
Normal file
46
codegpt-core/src/main/java/ee/carlrobert/vector/Word.java
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
package ee.carlrobert.vector;
|
||||
|
||||
import com.github.jelmerk.knn.Item;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class Word implements Item<Object, double[]> {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private final String id;
|
||||
private final String meta;
|
||||
private final double[] vector;
|
||||
|
||||
public Word(String id, String meta, double[] vector) {
|
||||
this.id = id;
|
||||
this.meta = meta;
|
||||
this.vector = vector;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String id() {
|
||||
return id;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double[] vector() {
|
||||
return vector;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int dimensions() {
|
||||
return vector.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Word{" +
|
||||
"id='" + id + '\'' +
|
||||
", vector=" + Arrays.toString(vector) +
|
||||
'}';
|
||||
}
|
||||
|
||||
public String getMeta() {
|
||||
return meta;
|
||||
}
|
||||
}
|
||||
184
codegpt-core/src/main/java/grammar/PythonLexerBase.java
Normal file
184
codegpt-core/src/main/java/grammar/PythonLexerBase.java
Normal file
|
|
@ -0,0 +1,184 @@
|
|||
package grammar;
|
||||
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Deque;
|
||||
import org.antlr.v4.runtime.CharStream;
|
||||
import org.antlr.v4.runtime.CommonToken;
|
||||
import org.antlr.v4.runtime.Lexer;
|
||||
import org.antlr.v4.runtime.Token;
|
||||
|
||||
/**
 * Hand-written base class for the generated Python lexer. It synthesizes the INDENT/DEDENT/
 * LINE_BREAK tokens Python's grammar needs by buffering extra tokens in a circular buffer and
 * draining them from {@link #nextToken()}.
 *
 * <p>NOTE(review): method names use UpperCamelCase (HandleNewLine, IncIndentLevel, ...) because
 * the generated grammar actions call them by these exact names — do not rename.
 */
public abstract class PythonLexerBase extends Lexer {
  public static int TabSize = 8;

  // The amount of opened braces, brackets and parenthesis.
  private int _opened;

  // The stack that keeps track of the indentation level.
  private final Deque<Integer> _indents = new ArrayDeque<>();

  // A circular buffer where extra tokens are pushed on (see the NEWLINE and WS lexer rules).
  private int _firstTokensInd;
  private int _lastTokenInd;
  private Token[] _buffer = new Token[32];
  private Token _lastToken;

  protected PythonLexerBase(CharStream input) {
    super(input);
  }

  @Override
  public void emit(Token token) {
    super.setToken(token);

    // If the buffer already holds a pending token, advance the write index; grow the buffer
    // when the write index catches up with the read index (buffer full).
    if (_buffer[_firstTokensInd] != null)
    {
      _lastTokenInd = IncTokenInd(_lastTokenInd);

      if (_lastTokenInd == _firstTokensInd)
      {
        // Enlarge buffer
        Token[] newArray = new Token[_buffer.length * 2];
        int destInd = newArray.length - (_buffer.length - _firstTokensInd);

        System.arraycopy(_buffer, 0, newArray, 0, _firstTokensInd);
        System.arraycopy(_buffer, _firstTokensInd, newArray, destInd, _buffer.length - _firstTokensInd);

        _firstTokensInd = destInd;
        _buffer = newArray;
      }
    }

    _buffer[_lastTokenInd] = token;
    _lastToken = token;
  }

  @Override
  public Token nextToken() {
    // Check if the end-of-file is ahead and there are still some DEDENTS expected.
    if (_input.LA(1) == EOF && _indents.size() > 0)
    {
      if (_buffer[_lastTokenInd] == null || _buffer[_lastTokenInd].getType() != PythonLexer.LINE_BREAK)
      {
        // First emit an extra line break that serves as the end of the statement.
        emit(PythonLexer.LINE_BREAK);
      }

      // Now emit as much DEDENT tokens as needed.
      while (_indents.size() != 0)
      {
        emit(PythonLexer.DEDENT);
        _indents.pop();
      }
    }

    Token next = super.nextToken();

    // No buffered tokens pending: pass the freshly lexed token straight through.
    if (_buffer[_firstTokensInd] == null)
    {
      return next;
    }

    // Drain one buffered token; `next` stays in the buffer via emit() and is returned later.
    Token result = _buffer[_firstTokensInd];
    _buffer[_firstTokensInd] = null;

    if (_firstTokensInd != _lastTokenInd)
    {
      _firstTokensInd = IncTokenInd(_firstTokensInd);
    }

    return result;
  }

  protected void HandleNewLine() {
    emit(PythonLexer.NEWLINE, HIDDEN, getText());

    char next = (char) _input.LA(1);

    // Process whitespaces in HandleSpaces
    if (next != ' ' && next != '\t' && IsNotNewLineOrComment(next))
    {
      ProcessNewLine(0);
    }
  }

  protected void HandleSpaces() {
    char next = (char) _input.LA(1);

    if ((_lastToken == null || _lastToken.getType() == PythonLexer.NEWLINE) && IsNotNewLineOrComment(next))
    {
      // Calculates the indentation of the provided spaces, taking the
      // following rules into account:
      //
      // "Tabs are replaced (from left to right) by one to eight spaces
      // such that the total number of characters up to and including
      // the replacement is a multiple of eight [...]"
      //
      // -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation

      int indent = 0;
      String text = getText();

      for (int i = 0; i < text.length(); i++) {
        indent += text.charAt(i) == '\t' ? TabSize - indent % TabSize : 1;
      }

      ProcessNewLine(indent);
    }

    emit(PythonLexer.WS, HIDDEN, getText());
  }

  // Called by the grammar when an opening brace/bracket/paren is seen; while _opened > 0,
  // newlines do not affect indentation (implicit line joining).
  protected void IncIndentLevel() {
    _opened++;
  }

  protected void DecIndentLevel() {
    if (_opened > 0) {
      --_opened;
    }
  }

  private boolean IsNotNewLineOrComment(char next) {
    return _opened == 0 && next != '\r' && next != '\n' && next != '\f' && next != '#';
  }

  // Emits LINE_BREAK, then INDENT or as many DEDENTs as the indentation change requires.
  private void ProcessNewLine(int indent) {
    emit(PythonLexer.LINE_BREAK);

    int previous = _indents.size() == 0 ? 0 : _indents.peek();

    if (indent > previous)
    {
      _indents.push(indent);
      emit(PythonLexer.INDENT);
    }
    else
    {
      // Possibly emit more than 1 DEDENT token.
      while (_indents.size() != 0 && _indents.peek() > indent)
      {
        emit(PythonLexer.DEDENT);
        _indents.pop();
      }
    }
  }

  private int IncTokenInd(int ind) {
    return (ind + 1) % _buffer.length;
  }

  private void emit(int tokenType) {
    emit(tokenType, DEFAULT_TOKEN_CHANNEL, "");
  }

  // Builds a synthetic token positioned just before the current char index and pushes it
  // through emit(Token) so it lands in the circular buffer.
  private void emit(int tokenType, int channel, String text) {
    int charIndex = getCharIndex();
    CommonToken token = new CommonToken(_tokenFactorySourcePair, tokenType, channel, charIndex - text.length(), charIndex - 1);
    token.setLine(getLine());
    token.setCharPositionInLine(getCharPositionInLine());
    token.setText(text);

    emit(token);
  }
}
|
||||
|
||||
26
codegpt-core/src/main/java/grammar/PythonParserBase.java
Normal file
26
codegpt-core/src/main/java/grammar/PythonParserBase.java
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
package grammar;
|
||||
|
||||
import org.antlr.v4.runtime.Parser;
|
||||
import org.antlr.v4.runtime.TokenStream;
|
||||
|
||||
public abstract class PythonParserBase extends Parser
|
||||
{
|
||||
public PythonVersion Version = PythonVersion.Autodetect;
|
||||
|
||||
protected PythonParserBase(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
protected boolean CheckVersion(int version) {
|
||||
return Version == PythonVersion.Autodetect || version == Version.getValue();
|
||||
}
|
||||
|
||||
protected void SetVersion(int requiredVersion) {
|
||||
if (requiredVersion == 2) {
|
||||
Version = PythonVersion.Python2;
|
||||
} else if (requiredVersion == 3) {
|
||||
Version = PythonVersion.Python3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
17
codegpt-core/src/main/java/grammar/PythonVersion.java
Normal file
17
codegpt-core/src/main/java/grammar/PythonVersion.java
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
package grammar;
|
||||
|
||||
/** Python language versions the parser distinguishes. */
public enum PythonVersion {
    /** Version not yet determined; detect it from the input. */
    Autodetect(0),
    /** Python 2.x. */
    Python2(2),
    /** Python 3.x. */
    Python3(3);

    // Numeric major version; 0 stands for "not yet detected".
    private final int value;

    PythonVersion(int value) {
        this.value = value;
    }

    /** Returns the numeric major version (0 for {@link #Autodetect}). */
    public int getValue() {
        return value;
    }
}
|
||||
166
codegpt-core/src/main/java/grammar/TypeScriptLexerBase.java
Normal file
166
codegpt-core/src/main/java/grammar/TypeScriptLexerBase.java
Normal file
|
|
@ -0,0 +1,166 @@
|
|||
package grammar;
|
||||
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Deque;
|
||||
import org.antlr.v4.runtime.CharStream;
|
||||
import org.antlr.v4.runtime.Lexer;
|
||||
import org.antlr.v4.runtime.Token;
|
||||
|
||||
/**
|
||||
* All lexer methods that used in grammar (IsStrictMode)
|
||||
* should start with Upper Case Char similar to Lexer rules.
|
||||
*/
|
||||
public abstract class TypeScriptLexerBase extends Lexer
{
    /**
     * Stores values of nested modes. By default mode is strict or
     * defined externally (useStrictDefault)
     */
    private final Deque<Boolean> scopeStrictModes = new ArrayDeque<>();

    // Last token seen on the default channel; drives IsRegexPossible() and ProcessStringLiteral().
    private Token lastToken = null;
    /**
     * Default value of strict mode
     * Can be defined externally by setUseStrictDefault
     */
    private boolean useStrictDefault = false;
    /**
     * Current value of strict mode
     * Can be defined during parsing, see StringFunctions.js and StringGlobal.js samples
     */
    private boolean useStrictCurrent = false;
    /**
     * Keeps track of the current depth of nested template string backticks.
     * E.g. after the X in:
     *
     * `${a ? `${X
     *
     * templateDepth will be 2. This variable is needed to determine if a `}` is a
     * plain CloseBrace, or one that closes an expression inside a template string.
     */
    private int templateDepth = 0;

    /**
     * Keeps track of the depth of open- and close-braces. Used for expressions like:
     *
     * `${[1, 2, 3].map(x => { return x * 2;}).join("")}`
     *
     * where the '}' from `return x * 2;}` should not become a `TemplateCloseBrace`
     * token but rather a `CloseBrace` token.
     */
    private int bracesDepth = 0;

    public TypeScriptLexerBase(CharStream input) {
        super(input);
    }

    public boolean getStrictDefault() {
        return useStrictDefault;
    }

    public void setUseStrictDefault(boolean value) {
        useStrictDefault = value;
        useStrictCurrent = value;
    }

    public boolean IsStrictMode() {
        return useStrictCurrent;
    }

    public void StartTemplateString() {
        this.bracesDepth = 0;
    }

    // True when inside a template string but not inside a nested { ... } block within it.
    public boolean IsInTemplateString() {
        return this.templateDepth > 0 && this.bracesDepth == 0;
    }

    /**
     * Return the next token from the character stream and records this last
     * token in case it resides on the default channel. This recorded token
     * is used to determine when the lexer could possibly match a regex
     * literal. Also changes scopeStrictModes stack if tokenize special
     * string 'use strict';
     *
     * @return the next token from the character stream.
     */
    @Override
    public Token nextToken() {
        Token next = super.nextToken();

        if (next.getChannel() == Token.DEFAULT_CHANNEL) {
            // Keep track of the last token on the default channel.
            this.lastToken = next;
        }

        return next;
    }

    // Entering a block: inherit strictness from the enclosing scope (or the default) and
    // push it so ProcessCloseBrace can restore it.
    protected void ProcessOpenBrace()
    {
        bracesDepth++;
        useStrictCurrent = scopeStrictModes.size() > 0 && scopeStrictModes.peek() ? true : useStrictDefault;
        scopeStrictModes.push(useStrictCurrent);
    }

    protected void ProcessCloseBrace()
    {
        bracesDepth--;
        useStrictCurrent = scopeStrictModes.size() > 0 ? scopeStrictModes.pop() : useStrictDefault;
    }

    // A "use strict" directive right after an opening brace (or at file start) switches the
    // current scope to strict mode.
    protected void ProcessStringLiteral()
    {
        if (lastToken == null || lastToken.getType() == TypeScriptLexer.OpenBrace)
        {
            String text = getText();
            if (text.equals("\"use strict\"") || text.equals("'use strict'"))
            {
                if (scopeStrictModes.size() > 0)
                    scopeStrictModes.pop();
                useStrictCurrent = true;
                scopeStrictModes.push(useStrictCurrent);
            }
        }
    }

    protected void IncreaseTemplateDepth() {
        this.templateDepth++;
    }

    protected void DecreaseTemplateDepth() {
        this.templateDepth--;
    }

    /**
     * Returns {@code true} if the lexer can match a regex literal.
     */
    protected boolean IsRegexPossible() {

        if (this.lastToken == null) {
            // No token has been produced yet: at the start of the input,
            // no division is possible, so a regex literal _is_ possible.
            return true;
        }

        switch (this.lastToken.getType()) {
            case TypeScriptLexer.Identifier:
            case TypeScriptLexer.NullLiteral:
            case TypeScriptLexer.BooleanLiteral:
            case TypeScriptLexer.This:
            case TypeScriptLexer.CloseBracket:
            case TypeScriptLexer.CloseParen:
            case TypeScriptLexer.OctalIntegerLiteral:
            case TypeScriptLexer.DecimalLiteral:
            case TypeScriptLexer.HexIntegerLiteral:
            case TypeScriptLexer.StringLiteral:
            case TypeScriptLexer.PlusPlus:
            case TypeScriptLexer.MinusMinus:
                // After any of the tokens above, no regex literal can follow.
                return false;
            default:
                // In all other cases, a regex literal _is_ possible.
                return true;
        }
    }
}
|
||||
124
codegpt-core/src/main/java/grammar/TypeScriptParserBase.java
Normal file
124
codegpt-core/src/main/java/grammar/TypeScriptParserBase.java
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
package grammar;
|
||||
|
||||
import org.antlr.v4.runtime.Lexer;
|
||||
import org.antlr.v4.runtime.Parser;
|
||||
import org.antlr.v4.runtime.Token;
|
||||
import org.antlr.v4.runtime.TokenStream;
|
||||
|
||||
/**
|
||||
* All parser methods that used in grammar (p, prev, notLineTerminator, etc.)
|
||||
* should start with lower case char similar to parser rules.
|
||||
*/
|
||||
public abstract class TypeScriptParserBase extends Parser
{
    public TypeScriptParserBase(TokenStream input) {
        super(input);
    }

    /**
     * Short form for prev(String str)
     */
    protected boolean p(String str) {
        return prev(str);
    }

    /**
     * Whether the previous token value equals to @param str
     */
    protected boolean prev(String str) {
        return _input.LT(-1).getText().equals(str);
    }

    /**
     * Short form for next(String str)
     */
    protected boolean n(String str) {
        return next(str);
    }

    /**
     * Whether the next token value equals to @param str
     */
    protected boolean next(String str) {
        return _input.LT(1).getText().equals(str);
    }

    // True when no LineTerminator token sits on the HIDDEN channel directly before the
    // current token (used for automatic-semicolon-insertion style rules).
    protected boolean notLineTerminator() {
        return !here(TypeScriptParser.LineTerminator);
    }

    protected boolean notOpenBraceAndNotFunction() {
        int nextTokenType = _input.LT(1).getType();
        return nextTokenType != TypeScriptParser.OpenBrace && nextTokenType != TypeScriptParser.Function_;
    }

    protected boolean closeBrace() {
        return _input.LT(1).getType() == TypeScriptParser.CloseBrace;
    }

    /**
     * Returns {@code true} iff on the current index of the parser's
     * token stream a token of the given {@code type} exists on the
     * {@code HIDDEN} channel.
     *
     * @param type
     * the type of the token on the {@code HIDDEN} channel
     * to check.
     *
     * @return {@code true} iff on the current index of the parser's
     * token stream a token of the given {@code type} exists on the
     * {@code HIDDEN} channel.
     */
    private boolean here(final int type) {

        // Get the token ahead of the current index.
        int possibleIndexEosToken = this.getCurrentToken().getTokenIndex() - 1;
        Token ahead = _input.get(possibleIndexEosToken);

        // Check if the token resides on the HIDDEN channel and if it's of the
        // provided type.
        return (ahead.getChannel() == Lexer.HIDDEN) && (ahead.getType() == type);
    }

    /**
     * Returns {@code true} iff on the current index of the parser's
     * token stream a token exists on the {@code HIDDEN} channel which
     * either is a line terminator, or is a multi line comment that
     * contains a line terminator.
     *
     * @return {@code true} iff on the current index of the parser's
     * token stream a token exists on the {@code HIDDEN} channel which
     * either is a line terminator, or is a multi line comment that
     * contains a line terminator.
     */
    protected boolean lineTerminatorAhead() {

        // Get the token ahead of the current index.
        int possibleIndexEosToken = this.getCurrentToken().getTokenIndex() - 1;
        Token ahead = _input.get(possibleIndexEosToken);

        if (ahead.getChannel() != Lexer.HIDDEN) {
            // We're only interested in tokens on the HIDDEN channel.
            return false;
        }

        if (ahead.getType() == TypeScriptParser.LineTerminator) {
            // There is definitely a line terminator ahead.
            return true;
        }

        if (ahead.getType() == TypeScriptParser.WhiteSpaces) {
            // Get the token ahead of the current whitespaces.
            possibleIndexEosToken = this.getCurrentToken().getTokenIndex() - 2;
            ahead = _input.get(possibleIndexEosToken);
        }

        // Get the token's text and type.
        String text = ahead.getText();
        int type = ahead.getType();

        // Check if the token is, or contains a line terminator.
        return (type == TypeScriptParser.MultiLineComment && (text.contains("\r") || text.contains("\n"))) ||
            (type == TypeScriptParser.LineTerminator);
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue