From 6f45f401168e2a3bb747931f7ba3088927a2f289 Mon Sep 17 00:00:00 2001 From: Infracloud-harsh <148188551+Infracloud-harsh@users.noreply.github.com> Date: Wed, 29 Nov 2023 20:36:16 +0530 Subject: [PATCH 1/5] Feature/prompt limit issue (#3) -> Prompt limit increase with split the prompt if exceeded number of tokens Prompt limit increase with split the prompt if exceeded number of tokens --- .github/workflows/openai-review.yml | 3 +- README.md | 28 +++-- __tests__/tokenizer.test.ts | 47 ++++++++ dist/index.js | 166 ++++++++++++++++++++++++---- dist/package.json | 3 +- src/inputs.ts | 4 + src/prompts.ts | 88 ++++++++++++++- src/review.ts | 60 ++++++++-- src/tokenizer.ts | 27 +++++ 9 files changed, 378 insertions(+), 48 deletions(-) create mode 100644 __tests__/tokenizer.test.ts diff --git a/.github/workflows/openai-review.yml b/.github/workflows/openai-review.yml index c35e3659..e82669ca 100644 --- a/.github/workflows/openai-review.yml +++ b/.github/workflows/openai-review.yml @@ -11,8 +11,7 @@ on: types: [created] concurrency: - group: - ${{ github.repository }}-${{ github.event.number || github.head_ref || + group: ${{ github.repository }}-${{ github.event.number || github.head_ref || github.sha }}-${{ github.workflow }}-${{ github.event_name == 'pull_request_review_comment' && 'pr_comment' || 'pr' }} cancel-in-progress: ${{ github.event_name != 'pull_request_review_comment' }} diff --git a/README.md b/README.md index fd6496a4..8fe30734 100644 --- a/README.md +++ b/README.md @@ -43,14 +43,23 @@ configure the required environment variables, such as `GITHUB_TOKEN` and `OPENAI_API_KEY`. For more information on usage, examples, contributing, and FAQs, you can refer to the sections below. 
-- [Overview](#overview) -- [Professional Version of CodeRabbit](#professional-version-of-coderabbit) -- [Reviewer Features](#reviewer-features) -- [Install instructions](#install-instructions) -- [Conversation with CodeRabbit](#conversation-with-coderabbit) -- [Examples](#examples) -- [Contribute](#contribute) -- [FAQs](#faqs) +- [AI-based PR reviewer and summarizer](#ai-based-pr-reviewer-and-summarizer) + - [Overview](#overview) + - [Reviewer Features:](#reviewer-features) + - [CodeRabbit Pro](#coderabbit-pro) + - [Install instructions](#install-instructions) + - [Environment variables](#environment-variables) + - [Models: `gpt-4` and `gpt-3.5-turbo`](#models-gpt-4-and-gpt-35-turbo) + - [Prompts \& Configuration](#prompts--configuration) + - [Conversation with CodeRabbit](#conversation-with-coderabbit) + - [Ignoring PRs](#ignoring-prs) + - [Examples](#examples) + - [Contribute](#contribute) + - [Developing](#developing) + - [FAQs](#faqs) + - [Review pull requests from forks](#review-pull-requests-from-forks) + - [Inspect the messages between OpenAI server](#inspect-the-messages-between-openai-server) + - [Disclaimer](#disclaimer) ## CodeRabbit Pro @@ -209,12 +218,11 @@ Install the dependencies $ npm install ``` -Build the typescript and package it for distribution - ```bash $ npm run build && npm run package ``` + ## FAQs ### Review pull requests from forks diff --git a/__tests__/tokenizer.test.ts b/__tests__/tokenizer.test.ts new file mode 100644 index 00000000..629be808 --- /dev/null +++ b/__tests__/tokenizer.test.ts @@ -0,0 +1,47 @@ +import {splitPrompt} from './../src/tokenizer' // Import your module with the splitPrompt function + +describe('splitPrompt function', () => { + it('should split a prompt into smaller pieces', async () => { + const maxTokens = 10 // Adjust this as needed + const prompt = 'This is a test prompt for splitting into smaller pieces.' 
+ + const result = await splitPrompt(maxTokens, prompt) + + // Calculate the expected output based on the maxTokens value + const expectedOutput = [ + 'This is a', + 'test', + 'prompt for', + 'splitting', + 'into', + 'smaller', + 'pieces.' + ] + + expect(result).toEqual(expectedOutput) + }) + + it('should handle a prompt smaller than maxTokens', async () => { + const maxTokens = 100 // A large value + const prompt = 'A very short prompt.' + + const result = await splitPrompt(maxTokens, prompt) + + // The prompt is already smaller than maxTokens, so it should return an array with the entire prompt. + const expectedOutput = 'A very short prompt.' + + expect(result).toEqual(expectedOutput) + }) + + it('should handle an empty prompt', async () => { + const maxTokens = 10 + const prompt = '' + + const result = await splitPrompt(maxTokens, prompt) + + // An empty prompt should result in an empty array. + const expectedOutput: string[] | string = '' + + expect(result).toEqual(expectedOutput) + }) +}) diff --git a/dist/index.js b/dist/index.js index a6f49664..f587e707 100644 --- a/dist/index.js +++ b/dist/index.js @@ -4047,7 +4047,7 @@ ${COMMENT_TAG}`; // eslint-disable-next-line camelcase pull_number: pullNumber }); - const pendingReview = reviews.data.find(review => review.state === 'PENDING'); + const pendingReview = reviews.data.find((review) => review.state === 'PENDING'); if (pendingReview) { (0,_actions_core__WEBPACK_IMPORTED_MODULE_0__.info)(`Deleting pending review for PR #${pullNumber} id: ${pendingReview.id}`); try { @@ -4488,7 +4488,7 @@ ${chain} per_page: 100, page }); - allCommits.push(...commits.data.map(commit => commit.sha)); + allCommits.push(...commits.data.map((commit) => commit.sha)); page++; } while (commits.data.length > 0); } @@ -4552,6 +4552,7 @@ class Inputs { diff; commentChain; comment; + fileIndex; constructor(systemMessage = '', title = 'no title provided', description = 'no description provided', rawSummary = '', shortSummary = '', filename = 
'', fileContent = 'file contents cannot be provided', fileDiff = 'file diff cannot be provided', patches = '', diff = 'no diff', commentChain = 'no other comments on this patch', comment = 'no comment provided') { this.systemMessage = systemMessage; this.title = title; @@ -4609,6 +4610,9 @@ class Inputs { if (this.comment) { content = content.replace('$comment', this.comment); } + if (this.fileIndex) { + content = content.replace('$fileIndex', this.fileIndex.toString()); + } return content; } } @@ -6714,6 +6718,70 @@ to the signatures of exported functions, global data structures and variables, and any changes that might affect the external interface or behavior of the code. `; + splittedSummarizeFileDiff = [ + `## GitHub PR Title + +\`$title\` + +## Description + +\`\`\` +$description +\`\`\` + +## Information + +This is the \`$fileIndex\` section of file differences. + +## Diff + +\`\`\`diff +$file_diff +\`\`\` + +## Instructions + +Please wait for another part of this file then only you can respond according to instructions that I will send in the end. + +`, + ` + ## Information + + This is the \`$fileIndex\` section of file differences. + + ## Diff + +\`\`\`diff +$file_diff +\`\`\` + + ## Instructions + + Please wait for another part of this file then only you can respond according to instructions that I will send in the end. + `, + ` + + ## Information + + This is the last section of file differences. + + ## Diff + + \`\`\`diff + $file_diff + \`\`\` + + + + ## Instructions + + Now, I would like you to succinctly summarize the diff within 500 words. + If applicable, your summary should include a note about alterations + to the signatures of exported functions, global data structures and + variables, and any changes that might affect the external interface or + behavior of the code. 
+` + ]; triageFileDiff = `Below the summary, I would also like you to triage the diff as \`NEEDS_REVIEW\` or \`APPROVED\` based on the following criteria: @@ -6922,11 +6990,27 @@ $comment this.summarize = summarize; this.summarizeReleaseNotes = summarizeReleaseNotes; } - renderSummarizeFileDiff(inputs, reviewSimpleChanges) { - let prompt = this.summarizeFileDiff; + renderSummarizeFileDiff(inputs, reviewSimpleChanges, fileContentIndex, splitPromptArrLength) { + let prompt; + if (fileContentIndex > 0) { + if (fileContentIndex === 1) { + prompt = this.splittedSummarizeFileDiff[0]; + } + else if (fileContentIndex > 1 && + fileContentIndex < splitPromptArrLength) { + prompt = this.splittedSummarizeFileDiff[1]; + } + else { + prompt = this.splittedSummarizeFileDiff[2]; + } + } + else + prompt = this.summarizeFileDiff; if (reviewSimpleChanges === false) { prompt += this.triageFileDiff; } + if (fileContentIndex > 0) + inputs.fileIndex = fileContentIndex; return inputs.render(prompt); } renderSummarizeChangesets(inputs) { @@ -7040,7 +7124,7 @@ const handleReviewComment = async (heavyBot, options, prompts) => { if (diffAll.data) { const files = diffAll.data.files; if (files != null) { - const file = files.find(f => f.filename === comment.path); + const file = files.find((f) => f.filename === comment.path); if (file != null && file.patch) { fileDiff = file.patch; } @@ -7062,7 +7146,7 @@ const handleReviewComment = async (heavyBot, options, prompts) => { } } // get tokens so far - let tokens = (0,_tokenizer__WEBPACK_IMPORTED_MODULE_4__/* .getTokenCount */ .V)(prompts.renderComment(inputs)); + let tokens = (0,_tokenizer__WEBPACK_IMPORTED_MODULE_4__/* .getTokenCount */ .VV)(prompts.renderComment(inputs)); if (tokens > options.heavyTokenLimits.requestTokens) { await commenter.reviewCommentReply(pullNumber, topLevelComment, 'Cannot reply to this comment as diff being commented is too large and exceeds the token limit.'); return; @@ -7071,7 +7155,7 @@ const handleReviewComment = 
async (heavyBot, options, prompts) => { if (fileDiff.length > 0) { // count occurrences of $file_diff in prompt const fileDiffCount = prompts.comment.split('$file_diff').length - 1; - const fileDiffTokens = (0,_tokenizer__WEBPACK_IMPORTED_MODULE_4__/* .getTokenCount */ .V)(fileDiff); + const fileDiffTokens = (0,_tokenizer__WEBPACK_IMPORTED_MODULE_4__/* .getTokenCount */ .VV)(fileDiff); if (fileDiffCount > 0 && tokens + fileDiffTokens * fileDiffCount <= options.heavyTokenLimits.requestTokens) { @@ -7084,7 +7168,7 @@ const handleReviewComment = async (heavyBot, options, prompts) => { if (summary) { // pack short summary into the inputs if it is not too long const shortSummary = commenter.getShortSummary(summary.body); - const shortSummaryTokens = (0,_tokenizer__WEBPACK_IMPORTED_MODULE_4__/* .getTokenCount */ .V)(shortSummary); + const shortSummaryTokens = (0,_tokenizer__WEBPACK_IMPORTED_MODULE_4__/* .getTokenCount */ .VV)(shortSummary); if (tokens + shortSummaryTokens <= options.heavyTokenLimits.requestTokens) { tokens += shortSummaryTokens; @@ -7274,10 +7358,10 @@ var tokenizer = __nccwpck_require__(652); // eslint-disable-next-line camelcase -const context = github.context; -const repo = context.repo; +let context = github.context; +let repo = context.repo; const ignoreKeyword = '@coderabbitai: ignore'; -const codeReview = async (lightBot, heavyBot, options, prompts) => { +const codeReview = async (lightBot, heavyBot, options, prompts, requestBody) => { const commenter = new lib_commenter/* Commenter */.Es(); const openaiConcurrencyLimit = pLimit(options.openaiConcurrencyLimit); const githubConcurrencyLimit = pLimit(options.githubConcurrencyLimit); @@ -7347,7 +7431,7 @@ const codeReview = async (lightBot, heavyBot, options, prompts) => { return; } // Filter out any file that is changed compared to the incremental changes - const files = targetBranchFiles.filter(targetBranchFile => incrementalFiles.some(incrementalFile => incrementalFile.filename === 
targetBranchFile.filename)); + const files = targetBranchFiles.filter((targetBranchFile) => incrementalFiles.some((incrementalFile) => incrementalFile.filename === targetBranchFile.filename)); if (files.length === 0) { (0,core.warning)('Skipped: files is null'); return; @@ -7381,6 +7465,7 @@ const codeReview = async (lightBot, heavyBot, options, prompts) => { (0,core.warning)('Skipped: context.payload.pull_request is null'); return null; } + // TODO========================== split file try { const contents = await octokit/* octokit.repos.getContent */.K.repos.getContent({ owner: repo.owner, @@ -7475,7 +7560,7 @@ ${filterIgnoredFiles.length > 0 // add in progress status to the summarize comment await commenter.comment(`${inProgressSummarizeCmt}`, lib_commenter/* SUMMARIZE_TAG */.Rp, 'replace'); const summariesFailed = []; - const doSummary = async (filename, fileContent, fileDiff) => { + const doSummary = async (filename, fileContent, fileDiff, fileContentIndex = 0, splitPromptArrLength = 0) => { (0,core.info)(`summarize: ${filename}`); const ins = inputs.clone(); if (fileDiff.length === 0) { @@ -7486,8 +7571,9 @@ ${filterIgnoredFiles.length > 0 ins.filename = filename; ins.fileDiff = fileDiff; // render prompt based on inputs so far - const summarizePrompt = prompts.renderSummarizeFileDiff(ins, options.reviewSimpleChanges); - const tokens = (0,tokenizer/* getTokenCount */.V)(summarizePrompt); + const summarizePrompt = prompts.renderSummarizeFileDiff(ins, options.reviewSimpleChanges, fileContentIndex, splitPromptArrLength); + // TODO - for sapliting file if prompt limit exceeds ======================================= + const tokens = (0,tokenizer/* getTokenCount */.VV)(summarizePrompt); if (tokens > options.lightTokenLimits.requestTokens) { (0,core.info)(`summarize: diff tokens exceeds limit, skip ${filename}`); summariesFailed.push(`${filename} (diff tokens exceeds limit)`); @@ -7528,9 +7614,25 @@ ${filterIgnoredFiles.length > 0 }; const summaryPromises = []; const 
skippedFiles = []; + let promptArray = []; for (const [filename, fileContent, fileDiff] of filesAndChanges) { + // ============================================================================================================= + // check current difference prompt token count + if ((0,tokenizer/* getTokenCount */.VV)(fileDiff) > options.lightTokenLimits.requestTokens) { + promptArray = await (0,tokenizer/* splitPrompt */.gS)(options.lightTokenLimits.requestTokens - 100, fileDiff); + } if (options.maxFiles <= 0 || summaryPromises.length < options.maxFiles) { - summaryPromises.push(openaiConcurrencyLimit(async () => await doSummary(filename, fileContent, fileDiff))); + if (Array.isArray(promptArray) && promptArray.length > 0) { + let index = 1; + for (const promptContent of promptArray) { + summaryPromises.push(openaiConcurrencyLimit(async () => await doSummary(filename, fileContent, promptContent, index, promptArray.length))); + index++; + } + promptArray = []; + } + else { + summaryPromises.push(openaiConcurrencyLimit(async () => await doSummary(filename, fileContent, fileDiff))); + } } else { skippedFiles.push(filename); @@ -7642,11 +7744,11 @@ ${summariesFailed.length > 0 const ins = inputs.clone(); ins.filename = filename; // calculate tokens based on inputs so far - let tokens = (0,tokenizer/* getTokenCount */.V)(prompts.renderReviewFileDiff(ins)); + let tokens = (0,tokenizer/* getTokenCount */.VV)(prompts.renderReviewFileDiff(ins)); // loop to calculate total patch tokens let patchesToPack = 0; for (const [, , patch] of patches) { - const patchTokens = (0,tokenizer/* getTokenCount */.V)(patch); + const patchTokens = (0,tokenizer/* getTokenCount */.VV)(patch); if (tokens + patchTokens > options.heavyTokenLimits.requestTokens) { (0,core.info)(`only packing ${patchesToPack} / ${patches.length} patches, tokens: ${tokens} / ${options.heavyTokenLimits.requestTokens}`); break; @@ -7681,7 +7783,7 @@ ${summariesFailed.length > 0 (0,core.warning)(`Failed to get comments: 
${e}, skipping. backtrace: ${e.stack}`); } // try packing comment_chain into this request - const commentChainTokens = (0,tokenizer/* getTokenCount */.V)(commentChain); + const commentChainTokens = (0,tokenizer/* getTokenCount */.VV)(commentChain); if (tokens + commentChainTokens > options.heavyTokenLimits.requestTokens) { commentChain = ''; @@ -8019,7 +8121,8 @@ ${review.comment}`; "use strict"; /* harmony export */ __nccwpck_require__.d(__webpack_exports__, { -/* harmony export */ "V": () => (/* binding */ getTokenCount) +/* harmony export */ "VV": () => (/* binding */ getTokenCount), +/* harmony export */ "gS": () => (/* binding */ splitPrompt) /* harmony export */ }); /* unused harmony export encode */ /* harmony import */ var _dqbd_tiktoken__WEBPACK_IMPORTED_MODULE_0__ = __nccwpck_require__(3171); @@ -8033,6 +8136,29 @@ function getTokenCount(input) { input = input.replace(/<\|endoftext\|>/g, ''); return encode(input).length; } +function splitPrompt(maxTokens, prompt) { + if (getTokenCount(prompt) < maxTokens) { + return prompt; + } + const promptPieces = []; + let remainingPrompt = prompt; + while (remainingPrompt.length > 0) { + const lastSpaceIndex = remainingPrompt.lastIndexOf(' ', maxTokens); + if (lastSpaceIndex >= 0) { + // Split at the last space + const piece = remainingPrompt.substring(0, lastSpaceIndex).trim(); + promptPieces.push(piece); + remainingPrompt = remainingPrompt.substring(lastSpaceIndex).trim(); + } + else { + // If no space found in the next `maxTokens` characters, split at `maxTokens` directly + const piece = remainingPrompt.substring(0, maxTokens).trim(); + promptPieces.push(piece); + remainingPrompt = remainingPrompt.substring(maxTokens).trim(); + } + } + return promptPieces; +} /***/ }), diff --git a/dist/package.json b/dist/package.json index 2c63c085..8b137891 100644 --- a/dist/package.json +++ b/dist/package.json @@ -1,2 +1 @@ -{ -} + diff --git a/src/inputs.ts b/src/inputs.ts index c391d567..d3d89c8f 100644 --- a/src/inputs.ts 
+++ b/src/inputs.ts @@ -11,6 +11,7 @@ export class Inputs { diff: string commentChain: string comment: string + fileIndex: number | undefined constructor( systemMessage = '', @@ -97,6 +98,9 @@ export class Inputs { if (this.comment) { content = content.replace('$comment', this.comment) } + if (this.fileIndex) { + content = content.replace('$fileIndex', this.fileIndex.toString()) + } return content } } diff --git a/src/prompts.ts b/src/prompts.ts index 32167d85..cbe44f1c 100644 --- a/src/prompts.ts +++ b/src/prompts.ts @@ -28,6 +28,74 @@ to the signatures of exported functions, global data structures and variables, and any changes that might affect the external interface or behavior of the code. ` + + splittedSummarizeFileDiff = [ + `## GitHub PR Title + +\`$title\` + +## Description + +\`\`\` +$description +\`\`\` + +## Information + +This is the \`$fileIndex\` section of file differences. + +## Diff + +\`\`\`diff +$file_diff +\`\`\` + +## Instructions + +Please wait for another part of this file then only you can respond according to instructions that I will send in the end. + +`, + + ` + ## Information + + This is the \`$fileIndex\` section of file differences. + + ## Diff + +\`\`\`diff +$file_diff +\`\`\` + + ## Instructions + + Please wait for another part of this file then only you can respond according to instructions that I will send in the end. + `, + + ` + + ## Information + + This is the last section of file differences. + + ## Diff + + \`\`\`diff + $file_diff + \`\`\` + + + + ## Instructions + + Now, I would like you to succinctly summarize the diff within 500 words. + If applicable, your summary should include a note about alterations + to the signatures of exported functions, global data structures and + variables, and any changes that might affect the external interface or + behavior of the code. 
+` + ] + triageFileDiff = `Below the summary, I would also like you to triage the diff as \`NEEDS_REVIEW\` or \`APPROVED\` based on the following criteria: @@ -244,12 +312,28 @@ $comment renderSummarizeFileDiff( inputs: Inputs, - reviewSimpleChanges: boolean + reviewSimpleChanges: boolean, + fileContentIndex: number, + splitPromptArrLength: number ): string { - let prompt = this.summarizeFileDiff + let prompt + if (fileContentIndex > 0) { + if (fileContentIndex === 1) { + prompt = this.splittedSummarizeFileDiff[0] + } else if ( + fileContentIndex > 1 && + fileContentIndex < splitPromptArrLength + ) { + prompt = this.splittedSummarizeFileDiff[1] + } else { + prompt = this.splittedSummarizeFileDiff[2] + } + } else prompt = this.summarizeFileDiff + if (reviewSimpleChanges === false) { prompt += this.triageFileDiff } + if (fileContentIndex > 0) inputs.fileIndex = fileContentIndex return inputs.render(prompt) } diff --git a/src/review.ts b/src/review.ts index 5e7dd9db..9c46af67 100644 --- a/src/review.ts +++ b/src/review.ts @@ -16,11 +16,11 @@ import {Inputs} from './inputs' import {octokit} from './octokit' import {type Options} from './options' import {type Prompts} from './prompts' -import {getTokenCount} from './tokenizer' +import {getTokenCount, splitPrompt} from './tokenizer' // eslint-disable-next-line camelcase -const context = github_context -const repo = context.repo +let context: any = github_context +let repo = context.repo const ignoreKeyword = '@coderabbitai: ignore' @@ -28,7 +28,8 @@ export const codeReview = async ( lightBot: Bot, heavyBot: Bot, options: Options, - prompts: Prompts + prompts: Prompts, + requestBody?: Request | null ): Promise => { const commenter: Commenter = new Commenter() @@ -131,9 +132,10 @@ export const codeReview = async ( } // Filter out any file that is changed compared to the incremental changes - const files = targetBranchFiles.filter(targetBranchFile => + const files = targetBranchFiles.filter((targetBranchFile: {filename: 
any}) => incrementalFiles.some( - incrementalFile => incrementalFile.filename === targetBranchFile.filename + (incrementalFile: {filename: any}) => + incrementalFile.filename === targetBranchFile.filename ) ) @@ -309,7 +311,9 @@ ${ const doSummary = async ( filename: string, fileContent: string, - fileDiff: string + fileDiff: string, + fileContentIndex: number = 0, + splitPromptArrLength: number = 0 ): Promise<[string, string, boolean] | null> => { info(`summarize: ${filename}`) const ins = inputs.clone() @@ -325,8 +329,12 @@ ${ // render prompt based on inputs so far const summarizePrompt = prompts.renderSummarizeFileDiff( ins, - options.reviewSimpleChanges + options.reviewSimpleChanges, + fileContentIndex, + splitPromptArrLength ) + + // spliting file if prompt limit exceeds const tokens = getTokenCount(summarizePrompt) if (tokens > options.lightTokenLimits.requestTokens) { @@ -372,13 +380,41 @@ ${ const summaryPromises = [] const skippedFiles = [] + let promptArray: string[] | string = [] for (const [filename, fileContent, fileDiff] of filesAndChanges) { + // check current difference prompt token count + if (getTokenCount(fileDiff) > options.lightTokenLimits.requestTokens) { + promptArray = await splitPrompt( + options.lightTokenLimits.requestTokens - 100, + fileDiff + ) + } if (options.maxFiles <= 0 || summaryPromises.length < options.maxFiles) { - summaryPromises.push( - openaiConcurrencyLimit( - async () => await doSummary(filename, fileContent, fileDiff) + if (Array.isArray(promptArray) && promptArray.length > 0) { + let index = 1 + for (const promptContent of promptArray) { + summaryPromises.push( + openaiConcurrencyLimit( + async () => + await doSummary( + filename, + fileContent, + promptContent, + index, + promptArray.length + ) + ) + ) + index++ + } + promptArray = [] + } else { + summaryPromises.push( + openaiConcurrencyLimit( + async () => await doSummary(filename, fileContent, fileDiff) + ) ) - ) + } } else { skippedFiles.push(filename) } diff --git 
a/src/tokenizer.ts b/src/tokenizer.ts index 61e230d6..3b4330b4 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer.ts @@ -11,3 +11,30 @@ export function getTokenCount(input: string): number { input = input.replace(/<\|endoftext\|>/g, '') return encode(input).length } + +export function splitPrompt( + maxTokens: number, + prompt: string +): string[] | string { + if (getTokenCount(prompt) < maxTokens) { + return prompt + } + const promptPieces: string[] = [] + let remainingPrompt = prompt + while (remainingPrompt.length > 0) { + const lastSpaceIndex = remainingPrompt.lastIndexOf(' ', maxTokens) + if (lastSpaceIndex >= 0) { + // Split at the last space + const piece = remainingPrompt.substring(0, lastSpaceIndex).trim() + promptPieces.push(piece) + remainingPrompt = remainingPrompt.substring(lastSpaceIndex).trim() + } else { + // If no space found in the next `maxTokens` characters, split at `maxTokens` directly + const piece = remainingPrompt.substring(0, maxTokens).trim() + promptPieces.push(piece) + remainingPrompt = remainingPrompt.substring(maxTokens).trim() + } + } + + return promptPieces +} From 11f2846771a7302beb868edfad57a41b9f947dde Mon Sep 17 00:00:00 2001 From: Infracloud-harsh <148188551+Infracloud-harsh@users.noreply.github.com> Date: Thu, 30 Nov 2023 23:00:56 +0530 Subject: [PATCH 2/5] revert syntax changes openai-review.yml --- .github/workflows/openai-review.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/openai-review.yml b/.github/workflows/openai-review.yml index e82669ca..c35e3659 100644 --- a/.github/workflows/openai-review.yml +++ b/.github/workflows/openai-review.yml @@ -11,7 +11,8 @@ on: types: [created] concurrency: - group: ${{ github.repository }}-${{ github.event.number || github.head_ref || + group: + ${{ github.repository }}-${{ github.event.number || github.head_ref || github.sha }}-${{ github.workflow }}-${{ github.event_name == 'pull_request_review_comment' && 'pr_comment' || 'pr' }} 
cancel-in-progress: ${{ github.event_name != 'pull_request_review_comment' }} From 44a12fb81b7db96277cafd3e71a072a098914b35 Mon Sep 17 00:00:00 2001 From: Infracloud-harsh <148188551+Infracloud-harsh@users.noreply.github.com> Date: Thu, 30 Nov 2023 23:05:08 +0530 Subject: [PATCH 3/5] Update tokenizer.ts --- src/tokenizer.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/tokenizer.ts b/src/tokenizer.ts index 3b4330b4..bed2596a 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer.ts @@ -22,6 +22,9 @@ export function splitPrompt( const promptPieces: string[] = [] let remainingPrompt = prompt while (remainingPrompt.length > 0) { + if (remainingPrompt.length > maxTokens && !remainingPrompt.includes(' ')) { + throw new Error(`Word length exceeds maxTokens: ${maxTokens}`); + } const lastSpaceIndex = remainingPrompt.lastIndexOf(' ', maxTokens) if (lastSpaceIndex >= 0) { // Split at the last space From 4cf4e4da6eb79552b478ef85aa91a4c930b4fc4b Mon Sep 17 00:00:00 2001 From: Infracloud-harsh <148188551+Infracloud-harsh@users.noreply.github.com> Date: Fri, 1 Dec 2023 01:07:53 +0530 Subject: [PATCH 4/5] Update tokenizer.ts as per the code reviewer AI comment --- src/tokenizer.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tokenizer.ts b/src/tokenizer.ts index bed2596a..563bda57 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer.ts @@ -23,7 +23,9 @@ export function splitPrompt( let remainingPrompt = prompt while (remainingPrompt.length > 0) { if (remainingPrompt.length > maxTokens && !remainingPrompt.includes(' ')) { - throw new Error(`Word length exceeds maxTokens: ${maxTokens}`); + const piece = remainingPrompt.substring(0, maxTokens).trim(); + promptPieces.push(piece); + remainingPrompt = remainingPrompt.substring(maxTokens).trim(); } const lastSpaceIndex = remainingPrompt.lastIndexOf(' ', maxTokens) if (lastSpaceIndex >= 0) { // Split at the last space From 9642c4539d03ddb236598a1e4a21b91a27a49730 Mon Sep 17 00:00:00 2001 From: Infracloud-harsh
/**
 * Splits `prompt` into smaller pieces that can be processed separately.
 *
 * When `getTokenCount(prompt)` is below `maxTokens` the prompt is returned
 * unchanged as a single string. Otherwise the prompt is trimmed and cut into
 * non-empty, whitespace-trimmed pieces that are returned in order as an
 * array (an empty array when the prompt holds nothing but whitespace).
 *
 * A cut is made at the last space character found at or before the limit;
 * when there is no such space the text is cut hard at the limit. A remainder
 * that already fits within the limit is emitted as one final piece.
 *
 * NOTE(review): once splitting starts, `maxTokens` is applied to the piece
 * length in characters (UTF-16 code units), not to a token count. The
 * parameter name suggests a token budget; confirm that character-based
 * cutting is what callers intend.
 *
 * @param maxTokens - Threshold for the token-count check and, floored to an
 * integer, the maximum length of every returned piece.
 * @param prompt - Text to split.
 * @returns The original `prompt` string when no split is needed, otherwise
 * the list of pieces.
 * @throws RangeError when a split is required but `maxTokens` floors to less
 * than 1 (or is `NaN`); no forward progress would be possible.
 */
export function splitPrompt(
  maxTokens: number,
  prompt: string
): string[] | string {
  // Fast path: the prompt is small enough, hand it back untouched.
  if (getTokenCount(prompt) < maxTokens) {
    return prompt
  }

  const promptPieces: string[] = []

  // Trim once up front so `remainingPrompt` never starts with whitespace.
  // That invariant guarantees every cut below yields a non-empty piece.
  let remainingPrompt = prompt.trim()
  if (remainingPrompt.length === 0) {
    return promptPieces
  }

  const maxPieceLength = Math.floor(maxTokens)
  // Written as a negated comparison so that NaN is rejected as well.
  if (!(maxPieceLength >= 1)) {
    throw new RangeError(
      `maxTokens must be at least 1 to split a prompt, got: ${maxTokens}`
    )
  }

  while (remainingPrompt.length > maxPieceLength) {
    const lastSpaceIndex = remainingPrompt.lastIndexOf(' ', maxPieceLength)
    // Index 0 can never be a space (see the invariant above), so any hit is
    // a usable word boundary; otherwise fall back to a hard cut at the limit.
    const cutIndex = lastSpaceIndex > 0 ? lastSpaceIndex : maxPieceLength

    promptPieces.push(remainingPrompt.substring(0, cutIndex).trim())
    // Advance by the cut position itself (not by the trimmed piece length)
    // so no text is repeated, then re-establish the no-leading-space invariant.
    remainingPrompt = remainingPrompt.substring(cutIndex).trim()
  }

  // Whatever is left fits within the limit: keep it whole.
  if (remainingPrompt.length > 0) {
    promptPieces.push(remainingPrompt)
  }

  return promptPieces
}