From 41a63194d8aeafcc78d56c94e2b935e8ed57a4e2 Mon Sep 17 00:00:00 2001 From: Eric allen Date: Fri, 22 Sep 2023 10:43:06 -0400 Subject: [PATCH] chore: clean up notebook and README --- README.md | 22 +- SentimentAnalysisWorkshop.ipynb | 672 ++++++++++++++++++++------------ utils/array.py | 7 + widgets/simple.py | 31 +- 4 files changed, 447 insertions(+), 285 deletions(-) create mode 100644 utils/array.py diff --git a/README.md b/README.md index 1364331..6cb4513 100644 --- a/README.md +++ b/README.md @@ -2,22 +2,26 @@ In this interactive workshop powered by a Jupyter Notebook, we'll explore the basics of traditional Sentiment Analysis and how we can expand our sentiment analysis capabilities with ChatGPT and some clever prompting strategies. -## Interactive Notebook +## Web-based Notebook [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ericrallen/sentiment-analysis-notebook/main) -In addition to being available on Binder, this [workshop notebook](https://ericrallen.github.io/sentiment-analysis-notebook/) is automatically deployed to GitHub Pages. +In addition to being available on Binder, this [workshop notebook](https://ericrallen.github.io/sentiment-analysis-notebook/) is automatically deployed to GitHub Pages via [`jupyter-book`](https://jupyterbook.org/intro.html). -**Note**: Unfortunately, the `ipywidgets` library used to create interactive demonstrations in the notebook does not seem to cooperate with GitHub Pages. This is my first Jupyter Notebook, so it's likely that I misconfigured something. +**Note**: Unfortunately, the `ipywidgets` library used to create interactive demonstrations in the notebook does not seem to cooperate with GitHub Pages or the interactive cells provided by [Thebe](https://jupyterbook.org/en/stable/interactive/thebe.html?highlight=thebe). This is my first Jupyter Notebook, so it's likely that I misconfigured something. 
-## Pre-requisites +**Note**: If you are running this notebook locally, it will look for the `OPENAI_API_KEY` environment variable, but you can also manually enter your OpenAI API key into the notebook. + +## Getting Started + +To run and interact with this notebook locally, follow the instructions below. + +### Pre-requisites 1. Python `>=3.11` 2. [OpenAI API Key](https://platform.openai.com/account/api-keys) -**Note**: If you are running this notebook locally, it will look for the `OPENAI_API_KEY` environment variable, but you can also manually enter your OpenAI API key into the notebook. - -## Getting Started +### Installation 1. Clone this repository @@ -43,7 +47,3 @@ In addition to being available on Binder, this [workshop notebook](https://ericr ```shell jupyter notebook ``` - -## Resources - -_Coming Soon_ diff --git a/SentimentAnalysisWorkshop.ipynb b/SentimentAnalysisWorkshop.ipynb index b284430..3b38db6 100644 --- a/SentimentAnalysisWorkshop.ipynb +++ b/SentimentAnalysisWorkshop.ipynb @@ -24,9 +24,9 @@ "source": [ "# Sentiment Analysis with ChatGPT\n", "\n", - "While sentiment analysis is sort of like the [\"Hello, world!\"](https://en.wikipedia.org/wiki/%22Hello,_World!%22_program#Variations) of Natural Language Processing (NLP), luckily for us it's a bit more fun than just echoing out a string.\n", + "While sentiment analysis is sort of like the [\"Hello, world!\"](https://en.wikipedia.org/wiki/%22Hello,_World!%22_program#Variations) of [Natural Language Processing](https://en.wikipedia.org/wiki/Natural_language_processing) (NLP), luckily for us it's a bit more fun than just echoing out a string.\n", "\n", - "This notebook will introduce you to sentiment analysis using traditional NLP tools and then explore analyzing sentiment with ChatGPT.\n", + "This notebook will introduce you to sentiment analysis using traditional NLP tools and then explore analyzing sentiment with [ChatGPT](https://openai.com/blog/chatgpt).\n", "\n", "**Note**: For a better 
learning experience, this notebook contains some code cells that are only used to render widgets for you to interact with and some others that only generate data structures or variables that later cells will reference.\n", "\n", @@ -34,21 +34,13 @@ "\n", "[Sentiment Analysis](https://en.wikipedia.org/wiki/Sentiment_analysis) is a way of analyzing some text to determine if it's positive, negative, or neutral.\n", "\n", - "This is the kind of thing that's pretty easy for a human who understands the language the text is written in, but it can be hard for a computer to really understand the underlying meaning behind language.\n", + "This is the kind of thing that's pretty easy for a human who understands the language the text is written in to do, but it can be hard for a computer to really understand the underlying meaning behind the language.\n", "\n", "### Examples\n", "\n", - "#### Neutral\n", - "\n", - "> I saw that movie.\n", - "\n", - "#### Positive\n", - "\n", - "> I love that movie\n", - "\n", - "#### Negative\n", - "\n", - "> I hate that movie.\n" + "- \"I saw that movie.\" (neutral)\n", + "- \"I love that movie.\" (positive)\n", + "- \"I hate that movie.\" (negative)\n" ] }, { @@ -60,10 +52,10 @@ "\n", "First, we'll import the relevant tools we'll be using in the notebook and configure some global variables.\n", "\n", - "- `nltk`: Python's [Natural Language Toolkit](https://www.nltk.org/)\n", + "- `nltk`: Python's [Natural Language Toolkit](https://www.nltk.org/), which we'll use to explore some more traditional sentiment analysis techniques\n", "- `openai`: Python library for interacting with the [OpenAI API](https://platform.openai.com/docs/api-reference/introduction)\n", "\n", - "You'll be able to configure these global variables using an embedded widget form below.\n" + "**Note**: In a later cell, we'll also make use of [`nrclex`](https://github.com/metalcorebear/NRCLex) to investigate some more advanced NLP, but because it's only used in one cell, we're 
importing it there for clarity.\n" ] }, { @@ -83,8 +75,8 @@ "import openai\n", "\n", "# download nltk data\n", - "nltk.download('vader_lexicon')\n", - "nltk.download('punkt')\n", + "nltk.download(\"vader_lexicon\")\n", + "nltk.download(\"punkt\")\n", "\n", "# globals\n", "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", @@ -92,6 +84,14 @@ "STORY_SAMPLE_SIZE = 12" ] }, + { + "cell_type": "markdown", + "id": "db52a5a8", + "metadata": {}, + "source": [ + "You'll be able to configure these global variables using an embedded widget form below.\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -104,15 +104,25 @@ }, "outputs": [], "source": [ - "# this cell focuses on some implemetation details for this notebook\n", - "# that aren't actually important to the workshop\n", + "# this cell focuses on some implementation details specific to\n", + "# this notebook that aren't actually important to understand\n", + "# you can just ignore/collapse it if you would prefer\n", "import ipywidgets as pywidgets\n", "import requests as request\n", "import requests_cache\n", "import backoff\n", "\n", "# configuration widgets\n", - "from widgets.config import modelDropdown, apiKeyInput, apiKeyUpdateButton, temperatureSlider, sampleSizeSlider, sampleSizeWarningLabel, openAiHeader, hackerNewsHeader\n", + "from widgets.config import (\n", + " modelDropdown,\n", + " apiKeyInput,\n", + " apiKeyUpdateButton,\n", + " temperatureSlider,\n", + " sampleSizeSlider,\n", + " sampleSizeWarningLabel,\n", + " openAiHeader,\n", + " hackerNewsHeader,\n", + ")\n", "\n", "# project-specific widgets\n", "from widgets.simple import simpleAnalysisWidget\n", @@ -121,10 +131,17 @@ "\n", "# project-specific utilities\n", "from utils.obfuscate import obfuscateKey\n", + "from utils.array import checkArrayLengths\n", + "\n", + "# we don't want to display too many entries in our DataFrames\n", + "# if the sample size is too large\n", + "DATAFRAME_LIMIT = 20\n", "\n", "# we'll use this session to cache 
our hacker news api requests\n", "REQUEST_CACHE_EXPIRATION_SECONDS = 60 * 15\n", - "session = requests_cache.CachedSession('hackernews_cache', expire_after=REQUEST_CACHE_EXPIRATION_SECONDS)" + "session = requests_cache.CachedSession(\n", + " \"hackernews_cache\", expire_after=REQUEST_CACHE_EXPIRATION_SECONDS\n", + ")" ] }, { @@ -156,33 +173,41 @@ }, "outputs": [], "source": [ - "# this code cell is just used to display a widget\n", - "# for us to configure some settings that other cells\n", - "# in this notebook rely on\n", + "# this code cell is just used to display a widget for us to\n", + "# configure some settings that other cells in this notebook rely on\n", + "# you can just ignore/collapse it if you would prefer\n", "apiKeyInput.value = obfuscateKey(OPENAI_API_KEY)\n", "sampleSizeSlider.value = STORY_SAMPLE_SIZE\n", "temperatureSlider.value = TEMPERATURE\n", "\n", + "\n", "def updateApiKey(event):\n", - " global OPENAI_API_KEY\n", - " OPENAI_API_KEY = apiKeyInput.value\n", - " apiKeyInput.value = obfuscateKey(OPENAI_API_KEY)\n", + " global OPENAI_API_KEY\n", + " OPENAI_API_KEY = apiKeyInput.value\n", + " apiKeyInput.value = obfuscateKey(OPENAI_API_KEY)\n", + "\n", "\n", "def updateSampleSize(change):\n", - " global STORY_SAMPLE_SIZE\n", - " STORY_SAMPLE_SIZE = change['new']\n", + " global STORY_SAMPLE_SIZE\n", + " STORY_SAMPLE_SIZE = change[\"new\"]\n", + "\n", "\n", "def updateTemperature(change):\n", - " global TEMPERATURE\n", - " TEMPERATURE = change['new']\n", + " global TEMPERATURE\n", + " TEMPERATURE = change[\"new\"]\n", + "\n", "\n", - "temperatureSlider.observe(updateTemperature, names='value')\n", - "sampleSizeSlider.observe(updateSampleSize, names='value')\n", + "temperatureSlider.observe(updateTemperature, names=\"value\")\n", + "sampleSizeSlider.observe(updateSampleSize, names=\"value\")\n", "apiKeyUpdateButton.on_click(updateApiKey)\n", "\n", "apiKeyConfigWidget = pywidgets.HBox([apiKeyInput, apiKeyUpdateButton])\n", - "openAiConfigWidget = 
pywidgets.VBox([openAiHeader, apiKeyConfigWidget, modelDropdown, temperatureSlider])\n", - "hackerNewsConfigWidget = pywidgets.VBox([hackerNewsHeader, sampleSizeSlider, sampleSizeWarningLabel])\n", + "openAiConfigWidget = pywidgets.VBox(\n", + " [openAiHeader, apiKeyConfigWidget, modelDropdown, temperatureSlider]\n", + ")\n", + "hackerNewsConfigWidget = pywidgets.VBox(\n", + " [hackerNewsHeader, sampleSizeSlider, sampleSizeWarningLabel]\n", + ")\n", "configWidget = pywidgets.VBox([openAiConfigWidget, hackerNewsConfigWidget])\n", "\n", "display(configWidget)" @@ -195,11 +220,11 @@ "source": [ "## Simple sentiment analysis with NLTK\n", "\n", - "Let's take a look at a simple example of sentiment analysis with `nltk` using the Valence Aware Dictionary and sEntiment Reasoner ([VADER](https://vadersentiment.readthedocs.io/en/latest/pages/introduction.html)) module.\n", + "Let's take a look at a simple example of sentiment analysis with `nltk` using the **V**alence **A**ware **D**ictionary and s**E**ntiment **R**easoner ([VADER](https://vadersentiment.readthedocs.io/en/latest/pages/introduction.html)) module.\n", "\n", "VADER's `SentimentIntensityAnalyzer` returns an object with positive, negative, and neutral scores for the given text as well as a combined `compound` score computed from the other three.\n", "\n", - "For this basic example, we're going to rely on the `compound` score and create a naive rating scale that converts that score into a string representation of the sentiment.\n" + "For this basic example, we're going to rely on the `compound` score and create a naive rating scale that converts that score into a string ranging from `very positive` to `very negative`\n" ] }, { @@ -213,38 +238,44 @@ "\n", "analyzer = SentimentIntensityAnalyzer()\n", "\n", + "\n", + "def convertSentimentToLabel(sentiment):\n", + " sentimentScore = sentiment[\"compound\"]\n", + "\n", + " if sentimentScore >= 0.75:\n", + " return \"very positive\"\n", + " elif sentimentScore >= 
0.4:\n", + " return \"positive\"\n", + " elif sentimentScore >= 0.1:\n", + " return \"leaning positive\"\n", + " elif sentimentScore <= -0.1 and sentimentScore > -0.4:\n", + " return \"leaning negative\"\n", + " elif sentimentScore <= -0.4 and sentimentScore > -0.75:\n", + " return \"negative\"\n", + " elif sentimentScore <= -0.75:\n", + " return \"very negative\"\n", + " else:\n", + " return \"neutral\"\n", + "\n", + "\n", "def analyzeSentiment(text):\n", - " if not text:\n", - " return('')\n", - "\n", - " sentiment = analyzer.polarity_scores(text)\n", - "\n", - " if sentiment['compound'] >= 0.75:\n", - " return('very positive')\n", - " elif sentiment['compound'] >= 0.4:\n", - " return('positive')\n", - " elif sentiment['compound'] >= 0.1:\n", - " return('leaning positive')\n", - " elif sentiment['compound'] <= -0.1 and sentiment['compound'] > -0.4:\n", - " return('leaning negative')\n", - " elif sentiment['compound'] <= -0.4 and sentiment['compound'] > -0.75:\n", - " return('negative')\n", - " elif sentiment['compound'] <= -0.75:\n", - " return('very negative')\n", - " else:\n", - " return('neutral')\n", - " \n", + " if not text:\n", + " return \"\"\n", + "\n", + " return analyzer.polarity_scores(text)\n", + "\n", + "\n", "# some simple test statements for our analyzer\n", "statements = [\n", - " 'I love that movie.',\n", - " 'I hate that movie.',\n", - " 'I like that movie.',\n", - " 'I dislike that movie.',\n", - " 'I saw that movie.',\n", + " \"I love that movie.\",\n", + " \"I hate that movie.\",\n", + " \"I like that movie.\",\n", + " \"I dislike that movie.\",\n", + " \"I saw that movie.\",\n", "]\n", "\n", "for statement in statements:\n", - " print(f\"{statement} ({analyzeSentiment(statement)})\")" + " print(f\"{statement} ({convertSentimentToLabel(analyzeSentiment(statement))})\")" ] }, { @@ -269,6 +300,7 @@ "source": [ "# this code cell is just used to display a widget\n", "# that uses the analyzeSentiment function we created\n", + "# you can just 
ignore/collapse it if you would prefer\n", "display(simpleAnalysisWidget)" ] }, @@ -277,7 +309,7 @@ "id": "50bbf2a5", "metadata": {}, "source": [ - "## How Sentiment Analysis Works\n", + "## How it works\n", "\n", "Sentiment analysis, like most text analysis involves a multistep process:\n", "\n", @@ -288,16 +320,22 @@ "3. **Vectorization**: converts the tokens into a id that can be used for comparison\n", "4. **Comparison**: compares the tokens to a known set of tokens to determine the sentiment\n", "\n", - "In this case we're taking advantage of an existing model that has been trained to analyze sentiment in text. If we wanted to build our own from scratch, it would be a more complicated process and require training data to feed into the model.\n", - "\n", - "With the advent of Generative Pre-Trained Transformer (GPT) models like those that power ChatGPT, and other transformer models that have exploded in popularity since, we can leverage the powerful inference and predictive capabilities of these models to perform sentiment analysis without having to train our own model, and we can even leverage some prompting techniques to quickly teach the model how to perform more unique analyses.\n", + "**Note**: This is a simplification of the process to distill it into an easy to digest format, but it is not a full picture and doesn't include the data gathering, cleaning, and labeling or actual training process.\n", "\n", "### Learn more\n", "\n", "- [Tokenization, Stemming, and Lemmatization in Python](https://thepythoncode.com/article/tokenization-stemming-and-lemmatization-in-python)\n", "- [Python for NLP: Tokenization, Stemming, and Lemmatization with SpaCy Library](https://stackabuse.com/python-for-nlp-tokenization-stemming-and-lemmatization-with-spacy-library/)\n", "- [What is Tokenization in Natural Language Processing (NLP)?](https://www.machinelearningplus.com/nlp/what-is-tokenization-in-natural-language-processing/)\n", - "- [Understanding NLP Word Embeddings 
— Text Vectorization](https://towardsdatascience.com/understanding-nlp-word-embeddings-text-vectorization-1a23744f7223)\n" + "- [Understanding NLP Word Embeddings — Text Vectorization](https://towardsdatascience.com/understanding-nlp-word-embeddings-text-vectorization-1a23744f7223)\n", + "\n", + "### Language models\n", + "\n", + "In this case we're taking advantage of an existing [language model](https://en.wikipedia.org/wiki/Language_model), VADER, that has been trained to analyze sentiment in text, but if we wanted to train our own model, it would be a much more involved process.\n", + "\n", + "With the advent of [Large Language Models](https://en.wikipedia.org/wiki/Large_language_model) (LLMs), like the [Generative Pre-Trained Transformer](https://en.wikipedia.org/wiki/Generative_pre-trained_transformer) (GPT) models that power ChatGPT - and the various [other models that have exploded in popularity](https://informationisbeautiful.net/visualizations/the-rise-of-generative-ai-large-language-models-llms-like-chatgpt/) since - we can leverage the powerful inference and predictive capabilities of these models to perform tasks like sentiment analysis with greater accuracy without having to train our own models.\n", + "\n", + "We can even leverage some prompting techniques - which we'll explore in later cells - to quickly teach the model how to perform more unique analyses and refine our results.\n" ] }, { @@ -307,11 +345,13 @@ "source": [ "## Real world example\n", "\n", - "So, let's see how this works with text generated by other humans without knowing that someone would be trying to analyze the sentiment of their text.\n", + "Let's take a look at how this works with text generated by other humans (_probably_) without expecting someone would be trying to analyze the sentiment of their text.\n", "\n", "For this example, we'll pull in a random sample of the [top stories](https://github.com/HackerNews/API#new-top-and-best-stories) on [Hacker 
News](https://news.ycombinator.com/) and analyze the sentiment of each submission's title.\n", "\n", - "You can run the cell below a few times to generate different samples of the top stories until you find a collection you prefer and then rerun the cells after it to use that sample for the rest of the notebook.\n" + "You can run the cell below a few times to generate different samples of the top stories until you find a collection you prefer and then rerun the cells after it to use that sample for the rest of the notebook.\n", + "\n", + "**Note**: You can use the configuration widget above to adjust your sample size to find the collection of data that feels right to you.\n" ] }, { @@ -323,57 +363,60 @@ "source": [ "import numpy as np\n", "\n", - "def getSampleStories(sampleSize = STORY_SAMPLE_SIZE):\n", - " topStoryIdsRequest = session.get('https://hacker-news.firebaseio.com/v0/topstories.json')\n", "\n", - " if topStoryIdsRequest.status_code != 200:\n", - " print('There was a problem getting the top stories from Hacker News')\n", - " exit()\n", + "def sampleStories(sampleSize=STORY_SAMPLE_SIZE):\n", + " topStoryIdsRequest = session.get(\n", + " \"https://hacker-news.firebaseio.com/v0/topstories.json\"\n", + " )\n", "\n", - " topStoryIds = topStoryIdsRequest.json()\n", + " if topStoryIdsRequest.status_code != 200:\n", + " print(\"There was a problem getting the top stories from Hacker News\")\n", + " exit()\n", "\n", - " storyIds = np.array(topStoryIds)[np.random.choice(len(topStoryIds), sampleSize, replace=False)]\n", + " topStoryIds = topStoryIdsRequest.json()\n", "\n", - " return storyIds\n", + " storyIds = np.array(topStoryIds)[\n", + " np.random.choice(len(topStoryIds), sampleSize, replace=False)\n", + " ]\n", + "\n", + " return storyIds\n", "\n", "\n", "def getStoryDetails(storyId):\n", - " # we'll use the same request cache so that we don't have to request a story's details more than once\n", - " storyRequest = 
session.get(f'https://hacker-news.firebaseio.com/v0/item/{storyId}.json')\n", + " # we'll use the same request cache so that we don't have to request a story's details more than once\n", + " storyRequest = session.get(\n", + " f\"https://hacker-news.firebaseio.com/v0/item/{storyId}.json\"\n", + " )\n", "\n", - " if storyRequest.status_code != 200:\n", - " print(f'There was a problem getting story {storyId} from Hacker News')\n", - " return None\n", - " else:\n", - " story = storyRequest.json()\n", + " if storyRequest.status_code != 200:\n", + " print(f\"There was a problem getting story {storyId} from Hacker News\")\n", + " return None\n", + " else:\n", + " story = storyRequest.json()\n", "\n", - " return story\n", + " return story\n", "\n", "\n", "def getStories(storyIds):\n", - " stories = {}\n", + " stories = {}\n", "\n", - " for storyId in storyIds:\n", - " story = getStoryDetails(storyId)\n", + " for storyId in storyIds:\n", + " story = getStoryDetails(storyId)\n", "\n", - " if 'title' in story:\n", - " stories[storyId] = {\n", - " \"title\": story['title'],\n", - " \"time\": story['time'],\n", - " \"sentiment\": {\n", - " \"vader\": '',\n", - " \"nrclex\": {},\n", - " \"openai\": {}\n", - " }\n", - " }\n", - " \n", - " return stories\n", + " if \"title\" in story:\n", + " stories[storyId] = {\n", + " \"title\": story[\"title\"],\n", + " \"time\": story[\"time\"],\n", + " \"sentiment\": {\"vader\": \"\", \"nrclex\": {}, \"openai\": {}},\n", + " }\n", "\n", + " return stories\n", "\n", - "stories = getStories(getSampleStories())\n", + "\n", + "stories = getStories(sampleStories())\n", "\n", "for storyId, story in stories.items():\n", - " print(story['title'])" + " print(story[\"title\"])" ] }, { @@ -392,9 +435,9 @@ "outputs": [], "source": [ "def analyzeStories(stories):\n", - " for _, story in stories.items():\n", - " story['sentiment']['vader'] = analyzeSentiment(story['title'])\n", - " print(f\"{story['title']} ({story['sentiment']['vader']})\")\n", + " for 
_, story in stories.items():\n", + " story[\"sentiment\"][\"vader\"] = analyzeSentiment(story[\"title\"])\n", + " print(f\"{story['title']} ({story['sentiment']['vader']})\")\n", "\n", "\n", "analyzeStories(stories)" @@ -407,7 +450,9 @@ "source": [ "While this is easy enough to implement and might give us a general idea of the sentiment, what if we want to push things a little further?\n", "\n", - "What if we have more complex text to analyze or have content that VADER's training doesn't handle well?\n" + "What if we have more complex text to analyze or have content that VADER's training doesn't handle well?\n", + "\n", + "We could train our own model, but that's a lot of work.\n" ] }, { @@ -417,11 +462,11 @@ "source": [ "## ChatGPT\n", "\n", - "ChatGPT is a Large Language Model (LLM) that uses the Generative Pre-Trained Transformer (GPT) architecture to generate text based on prompts that we provide.\n", + "ChatGPT is an LLM that makes use of GPT architecture combined with [Instruction Tuning](https://openai.com/research/instruction-following) to follow instructions and generate text based on the prompts that we provide.\n", "\n", - "It's training data includes a whole bunch of stuff that we've all posted on the Internet over the years, as well as lots of other content that has been published.\n", + "Its training data includes a whole bunch of stuff that we've all posted on the Internet over the years, as well as lots of other content.\n", "\n", - "This vast trove of training data, combined with the power of the GPT architecture and the utility of Instruction Tuning, gives ChatGPT an impressive ability to respond to our requests for many tasks without needing to be retrained or fine-tuned.\n", + "This vast trove of training data, combined with the flexibility provided by its architecture and tuning, gives ChatGPT an impressive ability to respond to our requests for many tasks without needing to be retrained or 
[fine-tuned](https://www.lakera.ai/insights/llm-fine-tuning-guide) for a specific task.\n", "\n", "### How ChatGPT works\n", "\n", @@ -443,34 +488,36 @@ "source": [ "import tiktoken\n", "\n", + "\n", "def tokenize(text):\n", - " tokens = []\n", - " ids = []\n", - " \n", - " # To get the tokeniser corresponding to a specific model in the OpenAI API:\n", - " encoding = tiktoken.encoding_for_model(modelDropdown.value)\n", + " tokens = []\n", + " ids = []\n", + "\n", + " # To get the tokeniser corresponding to a specific model in the OpenAI API:\n", + " encoding = tiktoken.encoding_for_model(modelDropdown.value)\n", + "\n", + " tokenized = encoding.encode(text)\n", "\n", - " tokenized = encoding.encode(text)\n", + " for tokenId in tokenized:\n", + " ids.append(tokenId)\n", + " tokens.append(encoding.decode_single_token_bytes(tokenId).decode(\"utf-8\"))\n", "\n", - " for tokenId in tokenized:\n", - " ids.append(tokenId)\n", - " tokens.append(encoding.decode_single_token_bytes(tokenId).decode('utf-8'))\n", + " return (tokens, ids)\n", "\n", - " return (tokens, ids)\n", "\n", "statements = [\n", - " 'I love that movie.',\n", - " 'I hate that movie.',\n", - " 'I like that movie.',\n", - " 'I dislike that movie.',\n", - " 'I saw that movie.',\n", + " \"I love that movie.\",\n", + " \"I hate that movie.\",\n", + " \"I like that movie.\",\n", + " \"I dislike that movie.\",\n", + " \"I saw that movie.\",\n", "]\n", "\n", "for statement in statements:\n", - " (statementTokens, statementIds) = tokenize(statement)\n", - " print(f\"{statementTokens} ({len(statementTokens)} tokens)\")\n", - " print(f\"{statementIds}\")\n", - " print('---')" + " (statementTokens, statementIds) = tokenize(statement)\n", + " print(f\"{statementTokens} ({len(statementTokens)} tokens)\")\n", + " print(f\"{statementIds}\")\n", + " print(\"---\")" ] }, { @@ -478,7 +525,7 @@ "id": "40566893", "metadata": {}, "source": [ - "We've wired the input below up to the same tokenizer function from above. 
Type in some text and see how the tokenizer responds.\n", + "We've wired the input below up to the same tokenizer function above. Type in some text and see how the tokenizer responds.\n", "\n", "There's also a great visualizer available at [https://gpt-tokenizer.dev/](https://gpt-tokenizer.dev/).\n" ] @@ -496,6 +543,8 @@ "outputs": [], "source": [ "# this code cell is just used to display a widget\n", + "# that uses the tokenize function we created\n", + "# you can just ignore/collapse it if you would prefer\n", "configureModel(modelDropdown.value)\n", "\n", "display(tokenAnalysisWidget)" @@ -508,11 +557,11 @@ "source": [ "## Prompt engineering\n", "\n", - "[Prompt engineering](https://en.wikipedia.org/wiki/Prompt_engineering) (or \"prompting\" if you are into the whole brevity thing) is the process of creating and testing instructions for the model (called \"prompts\") to find the right description that returns your desired results as often as possible and minimizes undesired results like [hallucinations]() and [apologies](https://news.ycombinator.com/item?id=36949931).\n", + "[Prompt engineering](https://en.wikipedia.org/wiki/Prompt_engineering) (or \"prompting\" if you are into the whole brevity thing) is the process of creating and testing instructions for the model (called \"prompts\") to find the most concise set of instructions that will guide the model towards returning your desired results as often as possible while minimizing undesired output like [hallucinations](https://en.wikipedia.org/wiki/Hallucination_%28artificial_intelligence%29) and [apologies](https://news.ycombinator.com/item?id=36949931).\n", "\n", - "In general, each message you send and each response that you receive become part of the overall prompt for the next message.\n", + "In general, each message you send and each response that you receive become part of the overall prompt for the next message, but there are strategies for managing a conversation's memory in order to selectively exclude messages that might lead to the model getting off track if repeated 
often enough.\n", "\n", - "You can think of the overall conversation as a document of text - it can help to imagine it as a screenplay.\n", + "You can think of the overall conversation as a document of text - it can help to imagine it as something like a [screenplay](https://en.wikipedia.org/wiki/Screenplay).\n", "\n", "There are various types of messages that make up this screenplay:\n", "\n", @@ -520,6 +569,8 @@ "- **User**: user messages are the individual prompts that the user sends to the model\n", "- **Assistant**: assistant messages are the responses the model generates to the user's prompts\n", "\n", + "### Example conversation document\n", + "\n", "The whole thing looks a bit like this:\n", "\n", "```\n", @@ -539,7 +590,7 @@ "or conducting a quick internet search for the most up-to-date information.\n", "```\n", "\n", - "**Note**: There are also messages with the type `function` that indicate that the Assistant would like to execute the function with the given name and pass it the given parameters, but for this demo, we'll be ignoring those.\n" + "**Note**: There are also messages with the type `function` that indicate that the Assistant would like to take advantage of [function calling](https://openai.com/blog/function-calling-and-other-api-updates) by asking the system to execute the function with the given name and pass it the given parameters, but for this demo, we'll be ignoring those.\n" ] }, { @@ -579,9 +630,10 @@ "Do not include any punctuation and only use lower case letters.\n", "\"\"\"\n", "\n", + "\n", "@backoff.on_exception(backoff.expo, openai.error.RateLimitError)\n", "def basicChatGptSentiment(prompt, model=modelDropdown.value):\n", - " messages = [{ \"role\": \"system\", \"content\": BASIC_SYSTEM_PROMPT }]\n", + " messages = [{\"role\": \"system\", \"content\": BASIC_SYSTEM_PROMPT}]\n", "\n", " messages.append({\"role\": \"user\", \"content\": prompt})\n", "\n", @@ -610,17 +662,17 @@ "outputs": [], "source": [ "if OPENAI_API_KEY:\n", - " 
for storyId, story in stories.items():\n", - " sentiment = basicChatGptSentiment(story['title'])\n", - " \n", - " if modelDropdown.value not in story['sentiment']['openai']:\n", - " story['sentiment']['openai'][modelDropdown.value] = {}\n", + " for storyId, story in stories.items():\n", + " sentiment = basicChatGptSentiment(story[\"title\"])\n", "\n", - " story['sentiment']['openai'][modelDropdown.value]['basic'] = sentiment\n", + " if modelDropdown.value not in story[\"sentiment\"][\"openai\"]:\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value] = {}\n", "\n", - " print(f\"{story['title']} ({sentiment})\")\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value][\"basic\"] = sentiment\n", + "\n", + " print(f\"{story['title']} ({sentiment})\")\n", "else:\n", - " print('Please enter your OpenAI API key above and rerun this cell')" + " print(\"Please enter your OpenAI API key above and rerun this cell\")" ] }, { @@ -646,23 +698,27 @@ "source": [ "from nrclex import NRCLex\n", "\n", + "\n", "def getNRCEmotion(text):\n", - " emotion = NRCLex(text)\n", + " emotion = NRCLex(text)\n", + "\n", + " return emotion.top_emotions\n", "\n", - " return emotion.top_emotions\n", "\n", "for storyId, story in stories.items():\n", - " emotions = []\n", + " emotions = []\n", "\n", - " emotionAnalysis = getNRCEmotion(story['title'])\n", + " emotionAnalysis = getNRCEmotion(story[\"title\"])\n", "\n", - " for (emotion, value) in emotionAnalysis:\n", - " if value > 0.00:\n", - " emotions.append(emotion)\n", + " for emotion, value in emotionAnalysis:\n", + " if value > 0.00:\n", + " emotions.append(emotion)\n", "\n", - " story['sentiment']['nrclex'] = \", \".join(emotions)\n", + " story[\"sentiment\"][\"nrclex\"] = \", \".join(emotions)\n", "\n", - " print(f\"{story['title']} {('(' + ', '.join(emotions) + ')') if len(emotions) else ''}\")" + " print(\n", + " f\"{story['title']} {('(' + ', '.join(emotions) + ')') if len(emotions) else ''}\"\n", + " )" ] }, { @@ -670,9 +726,7 
@@ "id": "740845f9", "metadata": {}, "source": [ - "But, with how short some of our titles are, it doesn't always seem to get good results and it seems like sometimes it disagrees with the VADER sentiment analysis, too.\n", - "\n", - "We could try to train a model to do this for us, but that would be a lot of work and we'd need a lot of data to train it on.\n", + "But, with how short some of our titles can be, it doesn't always seem to get good results and it seems like sometimes it disagrees with the VADER sentiment analysis.\n", "\n", "Luckily, we can pretty easily adapt our initial prompt to get ChatGPT to do this for us, too.\n" ] @@ -691,14 +745,17 @@ "\n", "1. Analyze the prompt for relevant emotion, tone, affinity, sarcasm, irony, etc.\n", "2. Analyze the likely emotional state of the author based on those findings\n", - "3. Summarize the emotional state and sentiment of the prompt based on your findings using 5 or less names for emotions using lowercase letters and separating each emotional state with a comma\n", + "3. 
Summarize the emotional state and sentiment of the prompt based on your findings with at least 2, but no more than 5 names for emotions\n", "\n", "Only return the output from the final step to the user.\n", + "\n", + "Only respond with lowercase letters and separate each emotion with a comma and a space\n", "\"\"\"\n", "\n", + "\n", "@backoff.on_exception(backoff.expo, openai.error.RateLimitError)\n", "def advancedChatGptSentiment(prompt, model=modelDropdown.value):\n", - " messages = [{ \"role\": \"system\", \"content\": ADVANCED_SYSTEM_PROMPT }]\n", + " messages = [{\"role\": \"system\", \"content\": ADVANCED_SYSTEM_PROMPT}]\n", "\n", " messages.append({\"role\": \"user\", \"content\": prompt})\n", "\n", @@ -727,17 +784,17 @@ "outputs": [], "source": [ "if OPENAI_API_KEY:\n", - " for storyId, story in stories.items():\n", - " sentiment = advancedChatGptSentiment(story['title'])\n", + " for storyId, story in stories.items():\n", + " sentiment = advancedChatGptSentiment(story[\"title\"])\n", "\n", - " if modelDropdown.value not in story['sentiment']['openai']:\n", - " story['sentiment']['openai'][modelDropdown.value] = {}\n", + " if modelDropdown.value not in story[\"sentiment\"][\"openai\"]:\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value] = {}\n", "\n", - " story['sentiment']['openai'][modelDropdown.value]['advanced'] = sentiment\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value][\"advanced\"] = sentiment\n", "\n", - " print(f\"{story['title']} ({sentiment})\")\n", + " print(f\"{story['title']} ({sentiment})\")\n", "else:\n", - " print('Please enter your OpenAI API key above and rerun this cell')" + " print(\"Please enter your OpenAI API key above and rerun this cell\")" ] }, { @@ -767,6 +824,7 @@ "# this code cell is just used to display a widget\n", "# that uses the analyzeSentiment function we created\n", "# as well as the advancedChatGptSentiment function\n", + "# you can just ignore/collapse it if you would prefer\n", 
"configureOpenAi(OPENAI_API_KEY, modelDropdown.value, TEMPERATURE)\n", "\n", "display(advancedAnalysisWidget)" @@ -798,15 +856,18 @@ "\n", "1. Analyze the prompt for relevant emotion, tone, affinity, sarcasm, irony, etc.\n", "2. Analyze the likely emotional state of the author based on those findings\n", - "3. Summarize the emotional state and sentiment of the prompt based on your findings into 5 or fewer names for emotional states\n", - "4. Convert each of the emotional states you identified into a representative emoji\n", + "3. Summarize the emotional state and sentiment of the prompt based on your findings with at least 2, but no more than 5 names for emotions\n", + "4. Convert each of the emotional states you identified into a representative emoji or summarize the overall collection of states with a single emoji if there is one that captures it well\n", "\n", "Only return the output from the final step to the user.\n", + "\n", + "Only repsond with emojis.\n", "\"\"\"\n", "\n", + "\n", "@backoff.on_exception(backoff.expo, openai.error.RateLimitError)\n", "def emojiChatGptSentiment(prompt, model=modelDropdown.value):\n", - " messages = [{ \"role\": \"system\", \"content\": EMOJI_SYSTEM_PROMPT }]\n", + " messages = [{\"role\": \"system\", \"content\": EMOJI_SYSTEM_PROMPT}]\n", "\n", " messages.append({\"role\": \"user\", \"content\": prompt})\n", "\n", @@ -835,17 +896,17 @@ "outputs": [], "source": [ "if OPENAI_API_KEY:\n", - " for storyId, story in stories.items():\n", - " sentiment = emojiChatGptSentiment(story['title'])\n", + " for storyId, story in stories.items():\n", + " sentiment = emojiChatGptSentiment(story[\"title\"])\n", "\n", - " if modelDropdown.value not in story['sentiment']['openai']:\n", - " story['sentiment']['openai'][modelDropdown.value] = {}\n", + " if modelDropdown.value not in story[\"sentiment\"][\"openai\"]:\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value] = {}\n", "\n", - " 
story['sentiment']['openai'][modelDropdown.value]['emoji'] = sentiment\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value][\"emoji\"] = sentiment\n", "\n", - " print(f\"{story['title']}({sentiment})\")\n", + " print(f\"{story['title']}({sentiment})\")\n", "else:\n", - " print('Please enter your OpenAI API key above and rerun this cell')" + " print(\"Please enter your OpenAI API key above and rerun this cell\")" ] }, { @@ -855,14 +916,16 @@ "source": [ "## Prompting strategies\n", "\n", - "In the previous examples we've been using [zero shot](https://www.promptingguide.ai/techniques/zeroshot) prompting, which means we're asking the model to repsond without giving it an example of what kind of response we'd like for it to have.\n", + "In the previous examples we've been using [Zero Shot](https://www.promptingguide.ai/techniques/zeroshot) prompting, which means we're asking the model to respond without giving it an example of what kind of response we'd like for it to have.\n", "\n", "There are other prompting strategies we can employ, though:\n", "\n", - "- **One shot**: gives the model a single example of how we'd like it to respond to guide it's output\n", - "- [**Few shot**](https://www.promptingguide.ai/techniques/fewshot): gives the model a few examples of how we'd like it to respond to different prompts to help guide it's output\n", + "- **One Shot**: gives the model a single example of how we'd like it to respond to guide its output; this is useful for situations where the model needs a little guidance, but we don't want to interfere with how it performs on other tasks\n", + "- [**Few Shot**](https://www.promptingguide.ai/techniques/fewshot): gives the model a few examples of how we'd like it to respond to different prompts to help guide its output; this is useful for situations where the model is doing something novel and needs more guidance, and we're going to be mostly focusing on asking the model to perform the task that we're providing examples 
for\n", "\n", - "### Learn more\n", + "**Note**: For other types of tasks there are various prompting strategies that can be useful, like [Chain of Thought Reasoning](https://www.promptingguide.ai/techniques/cot), [Directional Stimulus Prompting](https://www.promptingguide.ai/techniques/dsp), and even telling the model to [take a deep breath](https://arstechnica.com/information-technology/2023/09/telling-ai-model-to-take-a-deep-breath-causes-math-scores-to-soar-in-study/) can help it do math.\n", + "\n", + "### Learn more about prompting strategies\n", "\n", "- [Prompt Engineering Guide](https://www.promptingguide.ai/)\n", "- [Master Prompting Concepts: Zero-Shot and Few-Shot Prompting](https://www.promptengineering.org/master-prompting-concepts-zero-shot-and-few-shot-prompting/)\n", @@ -877,8 +940,6 @@ "source": [ "### One shot prompting\n", "\n", - "One shot prompting can be helpful for a slighlty novel task that the model can perform, but doens't always perform adequately.\n", - "\n", "Providing a single example of the desired output can help with things like proper formatting and refine the quality of the model's output.\n" ] }, @@ -891,16 +952,19 @@ "source": [ "# Grabbed from https://news.ycombinator.com/ at 2023-09-20 13:00 EDT\n", "# Reference: https://news.ycombinator.com/item?id=37598299\n", - "ONE_SHOT_USER_EXAMPLE = \"Cisco pulled out of the SentinelOne acquisition after due dilligence\"\n", + "ONE_SHOT_USER_EXAMPLE = (\n", + " \"Cisco pulled out of the SentinelOne acquisition after due dilligence\"\n", + ")\n", "\n", "ONE_SHOT_BOT_EXAMPLE = \"๐Ÿคจ\"\n", "\n", + "\n", "@backoff.on_exception(backoff.expo, openai.error.RateLimitError)\n", "def oneShotChatGptSentiment(prompt, model=modelDropdown.value):\n", " messages = [\n", - " { \"role\": \"system\", \"content\": EMOJI_SYSTEM_PROMPT },\n", - " {\"role\": \"user\", \"content\": ONE_SHOT_USER_EXAMPLE },\n", - " {\"role\": \"assistant\", \"content\": ONE_SHOT_BOT_EXAMPLE }\n", + " {\"role\": \"system\", 
\"content\": EMOJI_SYSTEM_PROMPT},\n", + " {\"role\": \"user\", \"content\": ONE_SHOT_USER_EXAMPLE},\n", + " {\"role\": \"assistant\", \"content\": ONE_SHOT_BOT_EXAMPLE},\n", " ]\n", "\n", " messages.append({\"role\": \"user\", \"content\": prompt})\n", @@ -914,6 +978,14 @@ " return response.choices[0].message[\"content\"]" ] }, + { + "cell_type": "markdown", + "id": "bea18c0d", + "metadata": {}, + "source": [ + "Let's apply this to our Hacker News stories from earlier and see how it changes the results.\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -922,17 +994,17 @@ "outputs": [], "source": [ "if OPENAI_API_KEY:\n", - " for storyId, story in stories.items():\n", - " sentiment = oneShotChatGptSentiment(story['title'])\n", + " for storyId, story in stories.items():\n", + " sentiment = oneShotChatGptSentiment(story[\"title\"])\n", "\n", - " if modelDropdown.value not in story['sentiment']['openai']:\n", - " story['sentiment']['openai'][modelDropdown.value] = {}\n", + " if modelDropdown.value not in story[\"sentiment\"][\"openai\"]:\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value] = {}\n", "\n", - " story['sentiment']['openai'][modelDropdown.value]['oneshot'] = sentiment\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value][\"oneshot\"] = sentiment\n", "\n", - " print(f\"{story['title']}({sentiment})\")\n", + " print(f\"{story['title']}({sentiment})\")\n", "else:\n", - " print('Please enter your OpenAI API key above and rerun this cell')" + " print(\"Please enter your OpenAI API key above and rerun this cell\")" ] }, { @@ -942,11 +1014,9 @@ "source": [ "### Few shot prompting\n", "\n", - "Few shot prompting is great for novel tasks where you want to guide the model's output, but don't have the resources to fine tune the model for the task.\n", - "\n", "Providing a few examples of desired responses can give the model a chance to learn how you'd like it to respond.\n", "\n", - "**Note**: Few shot prompting can also lead to issues 
where the model doesn't respond as creatively or won't perform as well on other tasks.\n" + "**Note**: Few shot prompting can also lead to issues where the model doesn't respond as creatively or won't perform as well on other tasks, which can be great for certain use cases, but might require a higher temperature setting for others.\n" ] }, { @@ -958,26 +1028,27 @@ "source": [ "# Grabbed from https://news.ycombinator.com/ at 2023-09-20 13:10 EDT\n", "FEW_SHOT_USER_EXAMPLES = [\n", - " ONE_SHOT_USER_EXAMPLE,\n", - " # Reference: https://news.ycombinator.com/item?id=37595898\n", - " \"Atlassian cripples Jira automation for all but enterprise customers\",\n", - " # Reference: https://news.ycombinator.com/item?id=37586264\n", - " \"Toyota Research claims breakthrough in teaching robots new behaviors\"\n", + " ONE_SHOT_USER_EXAMPLE,\n", + " # Reference: https://news.ycombinator.com/item?id=37595898\n", + " \"Atlassian cripples Jira automation for all but enterprise customers\",\n", + " # Reference: https://news.ycombinator.com/item?id=37586264\n", + " \"Toyota Research claims breakthrough in teaching robots new behaviors\",\n", "]\n", "\n", "FEW_SHOT_BOT_EXAMPLES = [\n", - " ONE_SHOT_BOT_EXAMPLE,\n", - " \"๐Ÿ˜–\",\n", - " \"๐Ÿ‘\",\n", + " ONE_SHOT_BOT_EXAMPLE,\n", + " \"๐Ÿ˜–\",\n", + " \"๐Ÿ‘\",\n", "]\n", "\n", + "\n", "@backoff.on_exception(backoff.expo, openai.error.RateLimitError)\n", "def fewShotChatGptSentiment(prompt, model=modelDropdown.value):\n", - " messages = [{ \"role\": \"system\", \"content\": EMOJI_SYSTEM_PROMPT }]\n", + " messages = [{\"role\": \"system\", \"content\": EMOJI_SYSTEM_PROMPT}]\n", "\n", " for i, userExample in enumerate(FEW_SHOT_USER_EXAMPLES):\n", - " messages.append({\"role\": \"user\", \"content\": userExample })\n", - " messages.append({\"role\": \"assistant\", \"content\": FEW_SHOT_BOT_EXAMPLES[i] })\n", + " messages.append({\"role\": \"user\", \"content\": userExample})\n", + " messages.append({\"role\": \"assistant\", \"content\": 
FEW_SHOT_BOT_EXAMPLES[i]})\n", "\n", " messages.append({\"role\": \"user\", \"content\": prompt})\n", "\n", @@ -995,7 +1066,7 @@ "id": "3bd690af", "metadata": {}, "source": [ - "Let's apply this to our Hacker News stories from earlier.\n" + "Let's apply this to our Hacker News stories from earlier and see how it changes the results.\n" ] }, { @@ -1006,17 +1077,17 @@ "outputs": [], "source": [ "if OPENAI_API_KEY:\n", - " for storyId, story in stories.items():\n", - " sentiment = fewShotChatGptSentiment(story['title'])\n", + " for storyId, story in stories.items():\n", + " sentiment = fewShotChatGptSentiment(story[\"title\"])\n", "\n", - " if modelDropdown.value not in story['sentiment']['openai']:\n", - " story['sentiment']['openai'][modelDropdown.value] = {}\n", + " if modelDropdown.value not in story[\"sentiment\"][\"openai\"]:\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value] = {}\n", "\n", - " story['sentiment']['openai'][modelDropdown.value]['fewshot'] = sentiment\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value][\"fewshot\"] = sentiment\n", "\n", - " print(f\"{story['title']} ({sentiment})\")\n", + " print(f\"{story['title']} ({sentiment})\")\n", "else:\n", - " print('Please enter your OpenAI API key above and rerun this cell')" + " print(\"Please enter your OpenAI API key above and rerun this cell\")" ] }, { @@ -1031,6 +1102,16 @@ "Let's compare the results of each analysis.\n" ] }, + { + "cell_type": "markdown", + "id": "5d4023ba", + "metadata": {}, + "source": [ + "### Gathering our data\n", + "\n", + "We'll start by mapping our data into a format that is easier to display with [DataFrames](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) provided by the [`pandas`](https://pandas.pydata.org/) library.\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1047,41 +1128,54 @@ "import pandas as pd\n", "\n", "sentimentData = {\n", - " \"Story\": [],\n", - " \"VADER\": [],\n", - " \"NRC\": 
[],\n", - " \"ChatGPT (Sentiment)\": [],\n", - " \"ChatGPT (Emotion)\": [],\n", - " \"Zero Shot\": [],\n", - " \"One Shot\": [],\n", - " \"Few Shot\": [],\n", + " \"Story\": [],\n", + " \"VADER\": [],\n", + " \"NRC\": [],\n", + " \"ChatGPT (Sentiment)\": [],\n", + " \"ChatGPT (Emotion)\": [],\n", + " \"Zero Shot\": [],\n", + " \"One Shot\": [],\n", + " \"Few Shot\": [],\n", "}\n", "\n", "for storyId, story in stories.items():\n", - " if 'title' in story:\n", - " sentimentData[\"Story\"].append(story['title'])\n", - " \n", - " if 'vader' in story['sentiment']:\n", - " sentimentData[\"VADER\"].append(story['sentiment']['vader'])\n", - " \n", - " if 'nrclex' in story['sentiment']:\n", - " sentimentData[\"NRC\"].append(story['sentiment']['nrclex'])\n", - "\n", - " if 'openai' in story['sentiment'] and modelDropdown.value in story['sentiment']['openai']:\n", - " if 'basic' in story['sentiment']['openai'][modelDropdown.value]:\n", - " sentimentData[\"ChatGPT (Sentiment)\"].append(story['sentiment']['openai'][modelDropdown.value]['basic'])\n", - "\n", - " if 'advanced' in story['sentiment']['openai'][modelDropdown.value]:\n", - " sentimentData[\"ChatGPT (Emotion)\"].append(story['sentiment']['openai'][modelDropdown.value]['advanced'])\n", - "\n", - " if 'emoji' in story['sentiment']['openai'][modelDropdown.value]:\n", - " sentimentData[\"Zero Shot\"].append(story['sentiment']['openai'][modelDropdown.value]['emoji'])\n", - "\n", - " if 'oneshot' in story['sentiment']['openai'][modelDropdown.value]:\n", - " sentimentData[\"One Shot\"].append(story['sentiment']['openai'][modelDropdown.value]['oneshot'])\n", - "\n", - " if 'fewshot' in story['sentiment']['openai'][modelDropdown.value]:\n", - " sentimentData[\"Few Shot\"].append(story['sentiment']['openai'][modelDropdown.value]['fewshot'])" + " if \"title\" in story:\n", + " sentimentData[\"Story\"].append(story[\"title\"])\n", + "\n", + " if \"vader\" in story[\"sentiment\"]:\n", + " 
sentimentData[\"VADER\"].append(story[\"sentiment\"][\"vader\"])\n", + "\n", + " if \"nrclex\" in story[\"sentiment\"]:\n", + " sentimentData[\"NRC\"].append(story[\"sentiment\"][\"nrclex\"])\n", + "\n", + " if (\n", + " \"openai\" in story[\"sentiment\"]\n", + " and modelDropdown.value in story[\"sentiment\"][\"openai\"]\n", + " ):\n", + " if \"basic\" in story[\"sentiment\"][\"openai\"][modelDropdown.value]:\n", + " sentimentData[\"ChatGPT (Sentiment)\"].append(\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value][\"basic\"]\n", + " )\n", + "\n", + " if \"advanced\" in story[\"sentiment\"][\"openai\"][modelDropdown.value]:\n", + " sentimentData[\"ChatGPT (Emotion)\"].append(\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value][\"advanced\"]\n", + " )\n", + "\n", + " if \"emoji\" in story[\"sentiment\"][\"openai\"][modelDropdown.value]:\n", + " sentimentData[\"Zero Shot\"].append(\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value][\"emoji\"]\n", + " )\n", + "\n", + " if \"oneshot\" in story[\"sentiment\"][\"openai\"][modelDropdown.value]:\n", + " sentimentData[\"One Shot\"].append(\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value][\"oneshot\"]\n", + " )\n", + "\n", + " if \"fewshot\" in story[\"sentiment\"][\"openai\"][modelDropdown.value]:\n", + " sentimentData[\"Few Shot\"].append(\n", + " story[\"sentiment\"][\"openai\"][modelDropdown.value][\"fewshot\"]\n", + " )" ] }, { @@ -1106,9 +1200,30 @@ "outputs": [], "source": [ "# this cell is only used to display a dataframe of our sentiment analysis results\n", - "sentimentDataFrame = pd.DataFrame(data=sentimentData, columns=[\"Story\", \"VADER\", \"ChatGPT (Sentiment)\"], )\n", - "\n", - "display(sentimentDataFrame)" + "try:\n", + " if checkArrayLengths(\n", + " sentimentData[\"Story\"],\n", + " sentimentData[\"VADER\"],\n", + " sentimentData[\"ChatGPT (Sentiment)\"],\n", + " ):\n", + " sentimentDataFrame = pd.DataFrame(\n", + " data=sentimentData,\n", + " 
columns=[\"Story\", \"VADER\", \"ChatGPT (Sentiment)\"],\n", + " )\n", + "\n", + " display(\n", + " sentimentDataFrame\n", + " if STORY_SAMPLE_SIZE <= DATAFRAME_LIMIT\n", + " else sentimentDataFrame.head(DATAFRAME_LIMIT)\n", + " )\n", + " else:\n", + " print(\n", + " \"Error: Different number of stories and sentiment results. Please rerun the VADER, Basic ChatGPT Example, and Gathering Our Data cells above and then rerun this cell.\"\n", + " )\n", + "except NameError:\n", + " print(\n", + " \"Error: No sentiment data to display. Please rerun the Gathering Our Data cell above and then rerun this cell.\"\n", + " )" ] }, { @@ -1133,12 +1248,30 @@ "outputs": [], "source": [ "# this code cell is only used to display a dataframe with our emotional analysis results\n", - "emotionDataFrame = pd.DataFrame(data=sentimentData, columns=[\"Story\", \"NRC\", \"ChatGPT (Emotion)\"])\n", - "\n", - "# often NRCLex will not have data and instead of displaying NaN we'll leave it blank\n", - "emotionDataFrame = emotionDataFrame.fillna('')\n", - "\n", - "display(emotionDataFrame)" + "try:\n", + " if checkArrayLengths(\n", + " sentimentData[\"Story\"], sentimentData[\"NRC\"], sentimentData[\"ChatGPT (Emotion)\"]\n", + " ):\n", + " emotionDataFrame = pd.DataFrame(\n", + " data=sentimentData, columns=[\"Story\", \"NRC\", \"ChatGPT (Emotion)\"]\n", + " )\n", + "\n", + " # often NRCLex will not have data and instead of displaying NaN we'll leave it blank\n", + " emotionDataFrame = emotionDataFrame.fillna(\"\")\n", + "\n", + " display(\n", + " emotionDataFrame\n", + " if STORY_SAMPLE_SIZE <= DATAFRAME_LIMIT\n", + " else emotionDataFrame.head(DATAFRAME_LIMIT)\n", + " )\n", + " else:\n", + " print(\n", + " \"Error: Different number of stories and sentiment results. Please rerun the NRCLex, Advanced ChatGPT Example, and Gathering Our Data cells above and then rerun this cell.\"\n", + " )\n", + "except NameError:\n", + " print(\n", + " \"Error: No emotion data to display. 
Please rerun the Gathering Our Data cell above and then rerun this cell.\"\n", + " )" ] }, { @@ -1163,9 +1296,30 @@ "outputs": [], "source": [ "# this cell is just used to display a dataframe with our emoji results\n", - "emojiDataFrame = pd.DataFrame(data=sentimentData, columns=[\"Story\", \"Zero Shot\", \"One Shot\", \"Few Shot\"])\n", - "\n", - "display(emojiDataFrame)" + "try:\n", + " if checkArrayLengths(\n", + " sentimentData[\"Story\"],\n", + " sentimentData[\"Zero Shot\"],\n", + " sentimentData[\"One Shot\"],\n", + " sentimentData[\"Few Shot\"],\n", + " ):\n", + " emojiDataFrame = pd.DataFrame(\n", + " data=sentimentData, columns=[\"Story\", \"Zero Shot\", \"One Shot\", \"Few Shot\"]\n", + " )\n", + "\n", + " display(\n", + " emojiDataFrame\n", + " if STORY_SAMPLE_SIZE <= DATAFRAME_LIMIT\n", + " else emojiDataFrame.head(DATAFRAME_LIMIT)\n", + " )\n", + " else:\n", + " print(\n", + " \"Error: Different number of stories and emoji results. Please rerun the Emjoji Classifier, One Shot, Few Shot, and Gathering Our Data cells above and then rerun this cell.\"\n", + " )\n", + "except NameError:\n", + " print(\n", + " \"Error: No emoji data to display. 
Please rerun the Gathering Our Data cell above and then rerun this cell.\"\n", + " )" ] }, { @@ -1175,7 +1329,7 @@ "source": [ "## Conclusion\n", "\n", - "NLP tasks like sentiment analyis used to required significant resources and time, but with the advent of LLMs like ChatGPT, we can quickly perform analyses and teach models to perform novel tasks.\n" + "NLP tasks like sentiment analysis used to require significant resources and time, but with the advent of LLMs like ChatGPT and the continued discovery of new prompting strategies to guide these models, we can quickly perform complex NLP analyses and teach models to perform novel tasks.\n" ] } ], diff --git a/utils/array.py b/utils/array.py new file mode 100644 index 0000000..2d83128 --- /dev/null +++ b/utils/array.py @@ -0,0 +1,7 @@ +def checkArrayLengths(*args): + lengths = [len(arg) for arg in args] + + if len(set(lengths)) != 1: + raise ValueError("Arrays have different lengths: %s" % lengths) + + return True diff --git a/widgets/simple.py b/widgets/simple.py index 93cfe98..1a276ac 100644 --- a/widgets/simple.py +++ b/widgets/simple.py @@ -6,30 +6,31 @@ # instantiate the sentiment analyzer analyzer = SentimentIntensityAnalyzer() -# analyze the sentiment of a string of text -def analyzeSentiment(text): - if not text: - return('') +def convertSentimentToLabel(sentiment): + sentimentScore = sentiment['compound'] - # use VADER to get the +/- sentiment of the string - sentiment = analyzer.polarity_scores(text) - - # map the sentiment to a human readable label - if sentiment['compound'] >= 0.75: + if sentimentScore >= 0.75: return('very positive') - elif sentiment['compound'] >= 0.4: + elif sentimentScore >= 0.4: return('positive') - elif sentiment['compound'] >= 0.1: + elif sentimentScore >= 0.1: return('leaning positive') - elif sentiment['compound'] <= -0.1 and sentiment['compound'] > -0.4: + elif sentimentScore <= -0.1 and sentimentScore > -0.4: return('leaning negative') - elif sentiment['compound'] <= -0.4 and 
sentiment['compound'] > -0.75: + elif sentimentScore <= -0.4 and sentimentScore > -0.75: return('negative') - elif sentiment['compound'] <= -0.75: + elif sentimentScore <= -0.75: return('very negative') else: return('neutral') + +def analyzeSentiment(text): + if not text: + return('') + + return analyzer.polarity_scores(text) + def getSentiment(change): # Get the sentiment sentiment = analyzeSentiment(change['new'].strip()) @@ -37,7 +38,7 @@ def getSentiment(change): if sentiment: with analysis: analysis.clear_output(wait=True) - print(sentiment) + print(convertSentimentToLabel(sentiment)) else: analysis.clear_output()