<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.4.550">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<meta name="author" content="Peter Gilles">
<meta name="dcterms.date" content="2024-04-24">
<title>Automatic Speech Recognition of Luxembourgish</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
width: 0.8em;
margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
vertical-align: middle;
}
/* CSS for syntax highlighting */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
}
pre.numberSource { margin-left: 3em; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
/* CSS for citations */
div.csl-bib-body { }
div.csl-entry {
clear: both;
margin-bottom: 0em;
}
.hanging-indent div.csl-entry {
margin-left:2em;
text-indent:-2em;
}
div.csl-left-margin {
min-width:2em;
float:left;
}
div.csl-right-inline {
margin-left:2em;
padding-left:1em;
}
div.csl-indent {
margin-left: 2em;
}</style>
<script src="Speech Recognition - Data Science and the Humanities - 2024_files/libs/clipboard/clipboard.min.js"></script>
<script src="Speech Recognition - Data Science and the Humanities - 2024_files/libs/quarto-html/quarto.js"></script>
<script src="Speech Recognition - Data Science and the Humanities - 2024_files/libs/quarto-html/popper.min.js"></script>
<script src="Speech Recognition - Data Science and the Humanities - 2024_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="Speech Recognition - Data Science and the Humanities - 2024_files/libs/quarto-html/anchor.min.js"></script>
<link href="Speech Recognition - Data Science and the Humanities - 2024_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="Speech Recognition - Data Science and the Humanities - 2024_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="Speech Recognition - Data Science and the Humanities - 2024_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="Speech Recognition - Data Science and the Humanities - 2024_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="Speech Recognition - Data Science and the Humanities - 2024_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
</head>
<body>
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<div class="quarto-alternate-formats"><h2>Other Formats</h2><ul><li><a href="index.html"><i class="bi bi-file-slides"></i>RevealJS</a></li></ul></div></div>
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Automatic Speech recognition of Luxembourgish</h1>
<p class="subtitle lead">Lecture ‘Data Science in the Humanities’</p>
</div>
<div class="quarto-title-meta">
<div>
<div class="quarto-title-meta-heading">Author</div>
<div class="quarto-title-meta-contents">
<p>Peter Gilles </p>
</div>
</div>
<div>
<div class="quarto-title-meta-heading">Published</div>
<div class="quarto-title-meta-contents">
<p class="date">April 24, 2024</p>
</div>
</div>
</div>
</header>
<section id="structure-of-the-lecture" class="level2">
<h2 class="anchored" data-anchor-id="structure-of-the-lecture">Structure of the lecture</h2>
<ol type="1">
<li>Introduction</li>
<li>Dealing with audio data</li>
<li>Basics of ASR</li>
<li>ASR for a small language like Luxembourgish</li>
<li>Meta’s wav2vec 2.0</li>
<li>OpenAi’s Whisper</li>
<li>Conclusion</li>
</ol>
</section>
<section id="introdcution-history-of-automatic-speech-recognition" class="level2">
<h2 class="anchored" data-anchor-id="introdcution-history-of-automatic-speech-recognition">Introdcution: History of Automatic Speech Recognition</h2>
<div class="columns">
<div class="column" style="width:60%;">
<ul>
<li>Euphoric phase in the 1950s at Bell Labs, IBM</li>
<li>Hidden Markov Models (HMM)
<ul>
<li>predict the probability of a sound or a word</li>
<li>not based on acoustic features alone</li>
</ul></li>
<li>1980s: Neural Networks
<ul>
<li>Loads of annotated training material needed (based on word or even phonetic segment)</li>
<li>Only for one specific language</li>
<li>Highly task and speaker dependent</li>
</ul></li>
<li>2010s: Deep Neural Networks / Transformer Models</li>
</ul>
</div><div class="column" style="width:40%;">
<div class="quarto-figure quarto-figure-left">
<figure class="figure">
<p><img src="Speech%20Recognition%20-%20Data%20Science%20and%20the%20Humanities%20-%202024_files/figure-html/629863a6-644e-4b31-902a-1f409e8fa497-1-513bd879-1536-4b1d-83d1-dad5da451ecf.png" class="img-fluid quarto-figure quarto-figure-left figure-img" style="width:60.0%"></p>
</figure>
</div>
</div>
</div>
</section>
<section id="intro-applications-of-asr" class="level2">
<h2 class="anchored" data-anchor-id="intro-applications-of-asr">Intro: Applications of ASR</h2>
<ul>
<li>Real-time transcription
<ul>
<li>dictation</li>
<li>subtitling of videos</li>
</ul></li>
<li>Transcription of audio recordings
<ul>
<li>Speeches, debates at the parliament, podcasts, videos etc.</li>
</ul></li>
<li>Audio commands
<ul>
<li>Siri, Alexa</li>
</ul></li>
<li>Keyword detection (e.g. for call centers)</li>
<li>Speaker verification / identification</li>
<li>Phonetic segmentation of audio recordings (‘Alignment’) for linguistic studies</li>
<li>Emotion detection</li>
<li>AI interactions
<ul>
<li>chatbots</li>
<li>Q&A</li>
</ul></li>
</ul>
</section>
<section id="dealing-with-audio-data" class="level2">
<h2 class="anchored" data-anchor-id="dealing-with-audio-data">Dealing with audio data</h2>
<ul>
<li><p>Recommended overview: <span class="citation" data-cites="huiSpeechRecognitionPhonetics2022">Hui (<a href="#ref-huiSpeechRecognitionPhonetics2022" role="doc-biblioref">2022</a>)</span></p></li>
<li><p>Acoustic representation of speech: continuous signal with amplitude modulation in time <img src="Speech%20Recognition%20-%20Data%20Science%20and%20the%20Humanities%20-%202024_files/figure-html/180f0b35-9386-49bf-8e4b-5459668ce62c-1-1628698b-05e0-433b-93c3-b80c07282edb.png" class="img-fluid" alt="waveform"></p></li>
<li><p>Incredibly variable, due to …</p>
<ul>
<li>accent</li>
<li>speaking style</li>
<li>speaker-dependent characteristics</li>
<li>recording conditions</li>
</ul></li>
<li><p>Basic acoustic representation for speech: Spectrogram</p>
<ul>
<li>x: time</li>
<li>y: frequency</li>
<li>color: intensity</li>
</ul></li>
</ul>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="Speech%20Recognition%20-%20Data%20Science%20and%20the%20Humanities%20-%202024_files/figure-html/180f0b35-9386-49bf-8e4b-5459668ce62c-2-9634cd64-767d-4da9-a152-75055119c352.png" class="img-fluid figure-img"></p>
<figcaption>spectrogram</figcaption>
</figure>
</div>
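<p>As a minimal illustration (not part of the lecture code), a waveform and a spectrogram like the ones above can be produced with <code>librosa</code>; the audio file name below is a placeholder:</p>
<div class="sourceCode"><pre class="sourceCode python"><code class="sourceCode python"># Minimal sketch: load an audio file and plot its waveform and spectrogram.
# "example.wav" is a placeholder file name, not part of the lecture material.
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

signal, sr = librosa.load("example.wav", sr=16000)    # resample to 16 kHz

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 6))
librosa.display.waveshow(signal, sr=sr, ax=ax1)       # amplitude over time
ax1.set_title("waveform")

stft = np.abs(librosa.stft(signal))                   # short-time Fourier transform
db = librosa.amplitude_to_db(stft, ref=np.max)        # magnitude in decibels
librosa.display.specshow(db, sr=sr, x_axis="time", y_axis="hz", ax=ax2)
ax2.set_title("spectrogram: x = time, y = frequency, colour = intensity")
plt.tight_layout()
plt.show()
</code></pre></div>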
</section>
<section id="dealing-with-audio-data-1" class="level2">
<h2 class="anchored" data-anchor-id="dealing-with-audio-data-1">Dealing with audio data</h2>
<ul>
<li>Processing unit for ASR:
<ul>
<li>frames of 25 ms width</li>
<li>a new frame every 10 ms, so adjacent frames overlap</li>
<li>capture all relevant linguistic information</li>
<li>feature extraction: Mel-frequency cepstral coefficients (MFCC) or Mel spectrogram</li>
<li>39 features per frame, stored as a vector (see the sketch after this list) <img src="Speech%20Recognition%20-%20Data%20Science%20and%20the%20Humanities%20-%202024_files/figure-html/a382c2c7-ec76-4d96-b016-1fb2b4d507cb-1-38aba670-424c-4143-843e-97113c382d4b.png" class="img-fluid" alt="from Hui (2019)"></li>
</ul></li>
</ul>
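<p>A minimal sketch (not the lecture's own code) of this feature extraction with <code>librosa</code>, using 25 ms windows and a 10 ms shift; the file name is a placeholder:</p>
<div class="sourceCode"><pre class="sourceCode python"><code class="sourceCode python"># Minimal sketch: 39-dimensional feature vectors per frame
# (13 MFCCs + deltas + delta-deltas), 25 ms windows, 10 ms shift.
import librosa
import numpy as np

signal, sr = librosa.load("example.wav", sr=16000)   # placeholder file name

n_fft = int(0.025 * sr)        # 25 ms window -> 400 samples at 16 kHz
hop_length = int(0.010 * sr)   # 10 ms shift  -> 160 samples at 16 kHz

mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13,
                            n_fft=n_fft, hop_length=hop_length)
delta = librosa.feature.delta(mfcc)             # first derivatives
delta2 = librosa.feature.delta(mfcc, order=2)   # second derivatives

features = np.vstack([mfcc, delta, delta2])     # shape: (39, number_of_frames)
print(features.shape)
</code></pre></div>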
</section>
<section id="basics-of-automatic-speech-recognition" class="level2">
<h2 class="anchored" data-anchor-id="basics-of-automatic-speech-recognition">Basics of Automatic Speech Recognition</h2>
<ul>
<li>Main task of ASR
<ul>
<li>Use a statistical model to recognise the correct text from a sequence of feature vectors</li>
</ul></li>
<li>Design of a traditional ASR system <img src="Speech%20Recognition%20-%20Data%20Science%20and%20the%20Humanities%20-%202024_files/figure-html/e8ddec01-3648-4295-84ea-60c944da1611-1-72475551-1146-48be-9f0b-d0a17d290b2e.png" class="img-fluid" style="width:80.0%"></li>
<li>Necessary ingredients for the model
<ul>
<li>acoustic model based on fine-grained annotated speech</li>
<li>pronunciation lexicon</li>
<li>language model</li>
</ul></li>
</ul>
</section>
<section id="basics-of-automatic-speech-recognition-1" class="level2">
<h2 class="anchored" data-anchor-id="basics-of-automatic-speech-recognition-1">Basics of Automatic Speech Recognition</h2>
<ul>
<li><p>The rise of Deep Learning (DL) <img src="Speech%20Recognition%20-%20Data%20Science%20and%20the%20Humanities%20-%202024_files/figure-html/643d0c9c-0319-44a4-88d5-2277d24e6db1-1-ae1b1b1c-a4f3-4cb2-b5fa-1cd2da84e6d0.png" class="img-fluid" alt="Boigne (2021)"></p></li>
<li><p>Transformer models</p>
<ul>
<li>accounting for context in a broad and flexible way</li>
<li>abstract and generalized representations of the speech signal that capture its relevant features, such as phonetic, spectral, or contextual information</li>
<li>generated in a Neural Network</li>
</ul></li>
<li><p>End-to-end models</p>
<ul>
<li>no intermediate steps necessary, e.g. phoneme detection, phoneme2grapheme conversion, pronunciation dictionary etc.</li>
</ul></li>
</ul>
</section>
<section id="asr-for-low-resource-languages" class="level2">
<h2 class="anchored" data-anchor-id="asr-for-low-resource-languages">ASR for low-resource languages</h2>
<div class="columns">
<div class="column" style="width:60%;">
<ul>
<li>Challenges of Luxembourgish
<ul>
<li>paucity of training data</li>
<li>multilingual context: words and phrases from French and German</li>
<li>lots of spelling variation</li>
<li>costly to compile training data<br>
</li>
</ul></li>
</ul>
</div><div class="column" style="width:40%;">
<p><img src="Speech%20Recognition%20-%20Data%20Science%20and%20the%20Humanities%20-%202024_files/figure-html/5564cf63-c0d0-4024-bd4a-b71404a88caa-1-6a0f8388-2ce8-470d-9051-4ff34109c304.png" class="img-fluid"></p>
</div>
</div>
</section>
<section id="compilation-of-training-data" class="level2">
<h2 class="anchored" data-anchor-id="compilation-of-training-data">Compilation of training data</h2>
<ul>
<li>Algorithms will ‘learn’ from example
<ul>
<li>pairs of text transcription and matching audio</li>
<li>compilation of audio samples of 2 to 20 sec length</li>
<li>multiple speakers</li>
<li>various speaking genres (debates in the parliament, media recordings, podcasts, interviews, elicited sentences etc.)</li>
<li>meticulous checks of the data</li>
<li>at the moment: 40,000 samples / 70 hours of training data</li>
</ul></li>
<li>Training data should contain as many sound and word combinations as possible (a sketch of the resulting manifest follows the table below)</li>
</ul>
<table class="table">
<colgroup>
<col style="width: 54%">
<col style="width: 45%">
</colgroup>
<thead>
<tr class="header">
<th>Transcription</th>
<th>Audio</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Luc hat eng Frëndin, déi lieft a Corse.</td>
<td><audio src="https://luxappsdata.uni.lu/schnessen/media/recording_recordings/2018-08-06/pg_191_3314_p1rzb.wav" controls=""> </audio></td>
</tr>
<tr class="even">
<td>hei iwwer déi Datumen ze schwätzen. Ech wëll just soen, datt ech e Rapport vum 16.</td>
<td><audio src="Chamber_2020_10_29_1917.wav" controls=""> </audio></td>
</tr>
<tr class="odd">
<td>Entschëllegt, et ass déi, vun där ech och schwätzen. A mir kënnen net äh am Kaffismarc liesen äh</td>
<td><audio src="Chamber_2020_10_29_1913.wav" controls=""> </audio></td>
</tr>
<tr class="even">
<td>Wat mer matgedeelt kréien. Mir hunn dat Resultat eréischt matgedeelt kritt, nodeem et</td>
<td><audio src="Chamber_2020_10_29_1914.wav" controls=""> </audio></td>
</tr>
</tbody>
</table>
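<p>For illustration only: such transcription/audio pairs are typically collected in a simple manifest with one row per clip. The column names below match the <code>lux_train.csv</code> file loaded in the preprocessing step later on; the rows themselves are hypothetical.</p>
<div class="sourceCode"><pre class="sourceCode python"><code class="sourceCode python"># Hypothetical illustration of the training manifest:
# one row per audio clip with its file path and transcription.
import pandas as pd

rows = [
    {"path": "clips/pg_191_3314_p1rzb.wav",
     "sentence": "Luc hat eng Frëndin, déi lieft a Corse."},
    {"path": "clips/Chamber_2020_10_29_1917.wav",
     "sentence": "hei iwwer déi Datumen ze schwätzen. Ech wëll just soen, datt ech e Rapport vum 16."},
]

pd.DataFrame(rows).to_csv("lux_train.csv", index=False)
</code></pre></div>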
</section>
<section id="metas-wav2vec-2.0-baevskiwav2vecframeworkselfsupervised2020" class="level2">
<h2 class="anchored" data-anchor-id="metas-wav2vec-2.0-baevskiwav2vecframeworkselfsupervised2020">Meta’s Wav2vec 2.0 (<span class="citation" data-cites="baevskiWav2vecFrameworkSelfSupervised2020">Baevski et al. (<a href="#ref-baevskiWav2vecFrameworkSelfSupervised2020" role="doc-biblioref">2020</a>)</span>)</h2>
<ul>
<li>Self-supervised learning from unlabeled speech</li>
<li>Trained on 0.5 million hours of speech from 128 languages</li>
<li>Pre-training with up to 1 billion parameters</li>
<li>Highly useful for low-resource languages, as the model builds upon similar structures from related languages, e.g. German, Dutch, French</li>
<li>Components of Wav2vec 2.0 <img src="Speech%20Recognition%20-%20Data%20Science%20and%20the%20Humanities%20-%202024_files/figure-html/a1bc37ab-9c55-4ccc-86c0-c28515663f7a-1-6c0dd04c-d39e-4538-a764-6426877a4064.png" class="img-fluid" alt="(FineTuneWav2Vec2English?)"></li>
</ul>
</section>
<section id="preprocessing-for-fine-tuning" class="level2">
<h2 class="anchored" data-anchor-id="preprocessing-for-fine-tuning">Preprocessing for fine-tuning</h2>
<ul>
<li>Fine-tuning
<ul>
<li>adding a specific layer to the pretrained self-supervised model</li>
<li>new training data will enable the model to ‘learn’ Luxembourgish</li>
</ul></li>
<li>preparation of the training data for training
<ul>
<li>several cleaning steps of the text data</li>
</ul></li>
</ul>
<div class="sourceCode" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Load raw data for training</span></span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a>df <span class="op">=</span>pd.read_csv(<span class="st">"./lux_train.csv"</span>)</span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a>df[<span class="st">'sentence'</span>]<span class="op">=</span>df[<span class="st">'sentence'</span>].<span class="bu">apply</span>(<span class="bu">str</span>)</span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="co"># replace numbers to text</span></span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a>df[<span class="st">'sentence'</span>]<span class="op">=</span>df[<span class="st">'sentence'</span>].<span class="bu">apply</span>(replaceInt)</span>
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="co"># define train and test split</span></span>
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a>train, test <span class="op">=</span> train_test_split(df, test_size<span class="op">=</span><span class="fl">0.15</span>, random_state<span class="op">=</span><span class="dv">42</span>)</span>
<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a>lux_voice_train <span class="op">=</span> Dataset.from_pandas(train)</span>
<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a>lux_voice_test <span class="op">=</span> Dataset.from_pandas(test)</span>
<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Cleaning of text data</span></span>
<span id="cb1-14"><a href="#cb1-14" aria-hidden="true" tabindex="-1"></a>chars_to_ignore_regex <span class="op">=</span> <span class="st">'[\,\?\.\!\+\;\:</span><span class="ch">\"</span><span class="st">\“\&\‘\”\�\(\)\<\>\«\»\„]'</span></span>
<span id="cb1-15"><a href="#cb1-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-16"><a href="#cb1-16" aria-hidden="true" tabindex="-1"></a><span class="co"># convert, also to lower case!</span></span>
<span id="cb1-17"><a href="#cb1-17" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> remove_special_characters(batch):</span>
<span id="cb1-18"><a href="#cb1-18" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(chars_to_ignore_regex, <span class="st">''</span>, batch[<span class="st">"sentence"</span>]).lower() <span class="op">+</span> <span class="st">" "</span></span>
<span id="cb1-19"><a href="#cb1-19" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> batch</span>
<span id="cb1-20"><a href="#cb1-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-21"><a href="#cb1-21" aria-hidden="true" tabindex="-1"></a>lux_voice_train <span class="op">=</span> lux_voice_train.<span class="bu">map</span>(remove_special_characters)</span>
<span id="cb1-22"><a href="#cb1-22" aria-hidden="true" tabindex="-1"></a>lux_voice_test <span class="op">=</span> lux_voice_test.<span class="bu">map</span>(remove_special_characters)</span>
<span id="cb1-23"><a href="#cb1-23" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-24"><a href="#cb1-24" aria-hidden="true" tabindex="-1"></a><span class="co"># replacing special characters</span></span>
<span id="cb1-25"><a href="#cb1-25" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> replace_hatted_characters(batch):</span>
<span id="cb1-26"><a href="#cb1-26" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[â]'</span>, <span class="st">'a'</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-27"><a href="#cb1-27" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[î]'</span>, <span class="st">'i'</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-28"><a href="#cb1-28" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[ô]'</span>, <span class="st">'o'</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-29"><a href="#cb1-29" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[û]'</span>, <span class="st">'u'</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-30"><a href="#cb1-30" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[ê]'</span>, <span class="st">'e'</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-31"><a href="#cb1-31" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[ï]'</span>, <span class="st">'i'</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-32"><a href="#cb1-32" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[ή]'</span>, <span class="st">'n'</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-33"><a href="#cb1-33" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[ß]'</span>, <span class="st">'ss'</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-34"><a href="#cb1-34" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[ñ]'</span>, <span class="st">'n'</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-35"><a href="#cb1-35" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[/]'</span>, <span class="st">' '</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-36"><a href="#cb1-36" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[’]'</span>, <span class="st">'</span><span class="ch">\'</span><span class="st">'</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-37"><a href="#cb1-37" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[°]'</span>, <span class="st">' grad '</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-38"><a href="#cb1-38" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[%]'</span>, <span class="st">' prozent '</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-39"><a href="#cb1-39" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[@]'</span>, <span class="st">' '</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-40"><a href="#cb1-40" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[#]'</span>, <span class="st">' '</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-41"><a href="#cb1-41" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[_]'</span>, <span class="st">' '</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-42"><a href="#cb1-42" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[–]'</span>, <span class="st">' '</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-43"><a href="#cb1-43" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[-]'</span>, <span class="st">' '</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-44"><a href="#cb1-44" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[</span><span class="ch">\\</span><span class="st">xad]'</span>, <span class="st">' '</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-45"><a href="#cb1-45" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">'[</span><span class="ch">\\</span><span class="st">x01]'</span>, <span class="st">' '</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-46"><a href="#cb1-46" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sentence"</span>] <span class="op">=</span> re.sub(<span class="st">' '</span>, <span class="st">' '</span>, batch[<span class="st">"sentence"</span>])</span>
<span id="cb1-47"><a href="#cb1-47" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> batch</span>
<span id="cb1-48"><a href="#cb1-48" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-49"><a href="#cb1-49" aria-hidden="true" tabindex="-1"></a>lux_voice_train <span class="op">=</span> lux_voice_train.<span class="bu">map</span>(replace_hatted_characters)</span>
<span id="cb1-50"><a href="#cb1-50" aria-hidden="true" tabindex="-1"></a>lux_voice_test <span class="op">=</span> lux_voice_test.<span class="bu">map</span>(replace_hatted_characters)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="preprocessing-for-fine-tuning-1" class="level2">
<h2 class="anchored" data-anchor-id="preprocessing-for-fine-tuning-1">Preprocessing for fine-tuning</h2>
<ul>
<li>Extract all distinct letters</li>
</ul>
<div class="sourceCode" id="cb2"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> extract_all_chars(batch):</span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a> all_text <span class="op">=</span> <span class="st">" "</span>.join(batch[<span class="st">"sentence"</span>])</span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a> vocab <span class="op">=</span> <span class="bu">list</span>(<span class="bu">set</span>(all_text))</span>
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {<span class="st">"vocab"</span>: [vocab], <span class="st">"all_text"</span>: [all_text]}</span>
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a>vocab_train <span class="op">=</span> lux_voice_train.<span class="bu">map</span>(extract_all_chars, batched<span class="op">=</span><span class="va">True</span>, batch_size<span class="op">=-</span><span class="dv">1</span>, keep_in_memory<span class="op">=</span><span class="va">True</span>, remove_columns<span class="op">=</span>lux_voice_train.column_names)</span>
<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a>vocab_test <span class="op">=</span> lux_voice_test.<span class="bu">map</span>(extract_all_chars, batched<span class="op">=</span><span class="va">True</span>, batch_size<span class="op">=-</span><span class="dv">1</span>, keep_in_memory<span class="op">=</span><span class="va">True</span>, remove_columns<span class="op">=</span>lux_voice_test.column_names)</span>
<span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-9"><a href="#cb2-9" aria-hidden="true" tabindex="-1"></a>vocab_list <span class="op">=</span> <span class="bu">list</span>(<span class="bu">set</span>(vocab_train[<span class="st">"vocab"</span>][<span class="dv">0</span>]) <span class="op">|</span> <span class="bu">set</span>(vocab_test[<span class="st">"vocab"</span>][<span class="dv">0</span>]))</span>
<span id="cb2-10"><a href="#cb2-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-11"><a href="#cb2-11" aria-hidden="true" tabindex="-1"></a>vocab_dict <span class="op">=</span> {v: k <span class="cf">for</span> k, v <span class="kw">in</span> <span class="bu">enumerate</span>(<span class="bu">sorted</span>(vocab_list))}</span>
<span id="cb2-12"><a href="#cb2-12" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(vocab_dict)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<ul>
<li>Vocabulary dictionary containing all letters in the training data</li>
</ul>
<div class="sourceCode" id="cb3"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a>{<span class="st">' '</span>: <span class="dv">0</span>, <span class="st">'$'</span>: <span class="dv">1</span>, <span class="st">"'"</span>: <span class="dv">2</span>, <span class="st">'0'</span>: <span class="dv">3</span>, <span class="st">'1'</span>: <span class="dv">4</span>, <span class="st">'2'</span>: <span class="dv">5</span>, <span class="st">'3'</span>: <span class="dv">6</span>, <span class="st">'4'</span>: <span class="dv">7</span>, <span class="st">'5'</span>: <span class="dv">8</span>, <span class="st">'6'</span>: <span class="dv">9</span>, <span class="st">'7'</span>: <span class="dv">10</span>, <span class="st">'8'</span>: <span class="dv">11</span>, <span class="st">'9'</span>: <span class="dv">12</span>, <span class="st">'</span><span class="ch">\\</span><span class="st">'</span>: <span class="dv">13</span>, <span class="st">'^'</span>: <span class="dv">14</span>, <span class="st">'a'</span>: <span class="dv">15</span>, <span class="st">'b'</span>: <span class="dv">16</span>, <span class="st">'c'</span>: <span class="dv">17</span>, <span class="st">'d'</span>: <span class="dv">18</span>, <span class="st">'e'</span>: <span class="dv">19</span>, <span class="st">'f'</span>: <span class="dv">20</span>, <span class="st">'g'</span>: <span class="dv">21</span>, <span class="st">'h'</span>: <span class="dv">22</span>, <span class="st">'i'</span>: <span class="dv">23</span>, <span class="st">'j'</span>: <span class="dv">24</span>, <span class="st">'k'</span>: <span class="dv">25</span>, <span class="st">'l'</span>: <span class="dv">26</span>, <span class="st">'m'</span>: <span class="dv">27</span>, <span class="st">'n'</span>: <span class="dv">28</span>, <span class="st">'o'</span>: <span class="dv">29</span>, <span class="st">'p'</span>: <span class="dv">30</span>, <span class="st">'q'</span>: <span class="dv">31</span>, <span class="st">'r'</span>: <span class="dv">32</span>, <span class="st">'s'</span>: <span class="dv">33</span>, <span class="st">'t'</span>: <span class="dv">34</span>, <span class="st">'u'</span>: <span class="dv">35</span>, <span class="st">'v'</span>: <span class="dv">36</span>, <span class="st">'w'</span>: <span class="dv">37</span>, <span class="st">'x'</span>: <span class="dv">38</span>, <span class="st">'y'</span>: <span class="dv">39</span>, <span class="st">'z'</span>: <span class="dv">40</span>, <span class="st">'à'</span>: <span class="dv">41</span>, <span class="st">'á'</span>: <span class="dv">42</span>, <span class="st">'ä'</span>: <span class="dv">43</span>, <span class="st">'ç'</span>: <span class="dv">44</span>, <span class="st">'è'</span>: <span class="dv">45</span>, <span class="st">'é'</span>: <span class="dv">46</span>, <span class="st">'ë'</span>: <span class="dv">47</span>, <span class="st">'ö'</span>: <span class="dv">48</span>, <span class="st">'ú'</span>: <span class="dv">49</span>, <span class="st">'ü'</span>: <span class="dv">50</span>, <span class="st">'œ'</span>: <span class="dv">51</span>, <span class="st">'‚'</span>: <span class="dv">52</span>, <span class="st">'…'</span>: <span class="dv">53</span>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="preprocessing-for-fine-tuning-2" class="level2">
<h2 class="anchored" data-anchor-id="preprocessing-for-fine-tuning-2">Preprocessing for fine-tuning</h2>
<ul>
<li>Convert the audio data to vectors</li>
</ul>
<div class="sourceCode" id="cb4"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> speech_file_to_array_fn(batch):</span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a> speech_array, sampling_rate <span class="op">=</span> sf.read(batch[<span class="st">"path"</span>])</span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"speech"</span>] <span class="op">=</span> speech_array</span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"sampling_rate"</span>] <span class="op">=</span> sampling_rate</span>
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"target_text"</span>] <span class="op">=</span> batch[<span class="st">"sentence"</span>]</span>
<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> batch</span>
<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a>lux_voice_train <span class="op">=</span> lux_voice_train.<span class="bu">map</span>(speech_file_to_array_fn, remove_columns<span class="op">=</span>lux_voice_train.column_names,num_proc<span class="op">=</span><span class="dv">8</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<ul>
<li>Prepare the model tokenizer, feature extractor and processor</li>
</ul>
<div class="sourceCode" id="cb5"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> transformers <span class="im">import</span> Wav2Vec2CTCTokenizer</span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a>tokenizer <span class="op">=</span> Wav2Vec2CTCTokenizer(<span class="st">"./vocab.json"</span>, unk_token<span class="op">=</span><span class="st">"[UNK]"</span>, pad_token<span class="op">=</span><span class="st">"[PAD]"</span>, word_delimiter_token<span class="op">=</span><span class="st">"|"</span>)</span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> transformers <span class="im">import</span> Wav2Vec2FeatureExtractor</span>
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a>feature_extractor <span class="op">=</span> Wav2Vec2FeatureExtractor(feature_size<span class="op">=</span><span class="dv">1</span>, sampling_rate<span class="op">=</span><span class="dv">16000</span>, padding_value<span class="op">=</span><span class="fl">0.0</span>, do_normalize<span class="op">=</span><span class="va">True</span>, return_attention_mask<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> transformers <span class="im">import</span> Wav2Vec2Processor</span>
<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a>processor <span class="op">=</span> Wav2Vec2Processor(feature_extractor<span class="op">=</span>feature_extractor, tokenizer<span class="op">=</span>tokenizer)</span>
<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a>processor.save_pretrained(<span class="st">"./wav2vec2-large-xls-r-LUXEMBOURGISH_NEW_DATASET"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<ul>
<li>Preprocess the audio vectors and labels with the Wav2Vec2Processor</li>
</ul>
<div class="sourceCode" id="cb6"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> prepare_dataset(batch):</span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a> <span class="co"># check that all files have the correct sampling rate</span></span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">assert</span> (</span>
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a> <span class="bu">len</span>(<span class="bu">set</span>(batch[<span class="st">"sampling_rate"</span>])) <span class="op">==</span> <span class="dv">1</span></span>
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a> ), <span class="ss">f"Make sure all inputs have the same sampling rate of </span><span class="sc">{</span>processor<span class="sc">.</span>feature_extractor<span class="sc">.</span>sampling_rate<span class="sc">}</span><span class="ss">."</span></span>
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"input_values"</span>] <span class="op">=</span> processor(batch[<span class="st">"speech"</span>], sampling_rate<span class="op">=</span>batch[<span class="st">"sampling_rate"</span>][<span class="dv">0</span>]).input_values</span>
<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a> <span class="cf">with</span> processor.as_target_processor():</span>
<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a> batch[<span class="st">"labels"</span>] <span class="op">=</span> processor(batch[<span class="st">"target_text"</span>]).input_ids</span>
<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> batch</span>
<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a>lux_voice_test <span class="op">=</span> lux_voice_test.<span class="bu">map</span>(prepare_dataset, remove_columns<span class="op">=</span>lux_voice_test.column_names, batch_size<span class="op">=</span><span class="dv">8</span>, num_proc<span class="op">=</span><span class="dv">8</span>, batched<span class="op">=</span><span class="va">True</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<ul>
<li>Audio data is now stored as a 1-dimensional array</li>
</ul>
<div class="sourceCode" id="cb7"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a> {<span class="st">'array'</span>: array([ <span class="fl">0.0000000e+00</span>, <span class="fl">0.0000000e+00</span>, <span class="fl">0.0000000e+00</span>, ...,</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a> <span class="op">-</span><span class="fl">7.4556941e-05</span>, <span class="op">-</span><span class="fl">1.4621433e-05</span>, <span class="op">-</span><span class="fl">5.7861507e-05</span>], dtype<span class="op">=</span>float32),</span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a> <span class="st">'path'</span>: <span class="st">'/root/.cache/huggingface/datasets/downloads/extracted/05be0c29807a73c9b099873d2f5975dae6d05e9f7d577458a2466ecb9a2b0c6b/cv-corpus-6.1-2020-12-11/tr/clips/common_voice_tr_21921195.mp3'</span>,</span>
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a> <span class="st">'sampling_rate'</span>: <span class="dv">16000</span>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="connectionist-temporal-classification-ctc" class="level2">
<h2 class="anchored" data-anchor-id="connectionist-temporal-classification-ctc">Connectionist Temporal Classification (CTC)</h2>
<ul>
<li>Used to decode a sound sequence input into text (toy example below)</li>
</ul>
<p><img src="Speech%20Recognition%20-%20Data%20Science%20and%20the%20Humanities%20-%202024_files/figure-html/f212d6ce-5173-4725-95c5-3af7113559cd-1-262023d8-578c-44cc-81a9-24bdc4bf1bb3.png" class="img-fluid"></p>
</section>
<section id="training-i.e.-fine-tuning" class="level2">
<h2 class="anchored" data-anchor-id="training-i.e.-fine-tuning">Training, i.e. fine-tuning</h2>
<ul>
<li>Load the pre-trained model from Hugging Face</li>
<li>Load the preprocessed dataset</li>
<li>Define training parameters</li>
<li>Wait for 30 hours</li>
</ul>
<div class="sourceCode" id="cb8"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a>model <span class="op">=</span> Wav2Vec2ForCTC.from_pretrained(</span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a> <span class="st">"facebook/wav2vec2-xls-r-300m"</span>, </span>
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a> <span class="co">#"facebook/wav2vec2-xls-r-2b", </span></span>
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a> attention_dropout<span class="op">=</span><span class="fl">0.0</span>,</span>
<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a> hidden_dropout<span class="op">=</span><span class="fl">0.0</span>,</span>
<span id="cb8-6"><a href="#cb8-6" aria-hidden="true" tabindex="-1"></a> feat_proj_dropout<span class="op">=</span><span class="fl">0.0</span>,</span>
<span id="cb8-7"><a href="#cb8-7" aria-hidden="true" tabindex="-1"></a> mask_time_prob<span class="op">=</span><span class="fl">0.05</span>, <span class="co">#0.05</span></span>
<span id="cb8-8"><a href="#cb8-8" aria-hidden="true" tabindex="-1"></a> layerdrop<span class="op">=</span><span class="fl">0.0</span>,</span>
<span id="cb8-9"><a href="#cb8-9" aria-hidden="true" tabindex="-1"></a> ctc_loss_reduction<span class="op">=</span><span class="st">"mean"</span>,</span>
<span id="cb8-10"><a href="#cb8-10" aria-hidden="true" tabindex="-1"></a> ctc_zero_infinity<span class="op">=</span><span class="va">True</span>, <span class="co">#Missing</span></span>
<span id="cb8-11"><a href="#cb8-11" aria-hidden="true" tabindex="-1"></a> pad_token_id<span class="op">=</span>processor.tokenizer.pad_token_id,</span>
<span id="cb8-12"><a href="#cb8-12" aria-hidden="true" tabindex="-1"></a> vocab_size<span class="op">=</span><span class="bu">len</span>(processor.tokenizer),</span>
<span id="cb8-13"><a href="#cb8-13" aria-hidden="true" tabindex="-1"></a>)</span>
<span id="cb8-14"><a href="#cb8-14" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">"Pretrained model loaded"</span>)</span>
<span id="cb8-15"><a href="#cb8-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-16"><a href="#cb8-16" aria-hidden="true" tabindex="-1"></a>model.freeze_feature_encoder()</span>
<span id="cb8-17"><a href="#cb8-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-18"><a href="#cb8-18" aria-hidden="true" tabindex="-1"></a>training_args <span class="op">=</span> TrainingArguments(</span>
<span id="cb8-19"><a href="#cb8-19" aria-hidden="true" tabindex="-1"></a> output_dir<span class="op">=</span><span class="st">"./wav2vec2-large-xls-r-LUXEMBOURGISH"</span>,</span>
<span id="cb8-20"><a href="#cb8-20" aria-hidden="true" tabindex="-1"></a> group_by_length<span class="op">=</span><span class="va">True</span>,</span>
<span id="cb8-21"><a href="#cb8-21" aria-hidden="true" tabindex="-1"></a> adam_beta1<span class="op">=</span><span class="fl">0.9</span>,</span>
<span id="cb8-22"><a href="#cb8-22" aria-hidden="true" tabindex="-1"></a> adam_beta2<span class="op">=</span><span class="fl">0.998</span>,</span>
<span id="cb8-23"><a href="#cb8-23" aria-hidden="true" tabindex="-1"></a> adam_epsilon<span class="op">=</span><span class="fl">1e-6</span>,</span>
<span id="cb8-24"><a href="#cb8-24" aria-hidden="true" tabindex="-1"></a> weight_decay<span class="op">=</span><span class="fl">0.005</span>, <span class="co"># Leo: to try 0.01</span></span>
<span id="cb8-25"><a href="#cb8-25" aria-hidden="true" tabindex="-1"></a> per_device_train_batch_size<span class="op">=</span><span class="dv">18</span>, <span class="co">#16, 24, 32(TPU?)</span></span>
<span id="cb8-26"><a href="#cb8-26" aria-hidden="true" tabindex="-1"></a> gradient_accumulation_steps<span class="op">=</span><span class="dv">2</span>, </span>
<span id="cb8-27"><a href="#cb8-27" aria-hidden="true" tabindex="-1"></a> gradient_checkpointing<span class="op">=</span><span class="va">True</span>, <span class="co">#Missing</span></span>
<span id="cb8-28"><a href="#cb8-28" aria-hidden="true" tabindex="-1"></a> evaluation_strategy<span class="op">=</span><span class="st">"steps"</span>,</span>
<span id="cb8-29"><a href="#cb8-29" aria-hidden="true" tabindex="-1"></a> num_train_epochs<span class="op">=</span><span class="dv">30</span>, <span class="co"># Leo: 20, 30 epochs, 40, yqs 0.5 - Just for demo, change it</span></span>
<span id="cb8-30"><a href="#cb8-30" aria-hidden="true" tabindex="-1"></a> fp16<span class="op">=</span><span class="va">True</span>, <span class="co"># Leo: Just on CUDA devices</span></span>
<span id="cb8-31"><a href="#cb8-31" aria-hidden="true" tabindex="-1"></a> save_steps<span class="op">=</span><span class="dv">400</span>, <span class="co">#20 Just for demo, change it</span></span>
<span id="cb8-32"><a href="#cb8-32" aria-hidden="true" tabindex="-1"></a> eval_steps<span class="op">=</span><span class="dv">400</span>, <span class="co">#20 Just for demo, change it</span></span>
<span id="cb8-33"><a href="#cb8-33" aria-hidden="true" tabindex="-1"></a> logging_steps<span class="op">=</span><span class="dv">400</span>, <span class="co">#20 Just for demo, change it</span></span>
<span id="cb8-34"><a href="#cb8-34" aria-hidden="true" tabindex="-1"></a> learning_rate<span class="op">=</span><span class="fl">3e-4</span>, </span>
<span id="cb8-35"><a href="#cb8-35" aria-hidden="true" tabindex="-1"></a> warmup_steps<span class="op">=</span><span class="dv">800</span>, <span class="co">#400, 20 Just for demo, change it</span></span>
<span id="cb8-36"><a href="#cb8-36" aria-hidden="true" tabindex="-1"></a> save_total_limit<span class="op">=</span><span class="dv">5</span>,</span>
<span id="cb8-37"><a href="#cb8-37" aria-hidden="true" tabindex="-1"></a> metric_for_best_model<span class="op">=</span><span class="st">"eval_loss"</span>, <span class="co"># Leo for early stopping: eval_wer</span></span>
<span id="cb8-38"><a href="#cb8-38" aria-hidden="true" tabindex="-1"></a> load_best_model_at_end<span class="op">=</span><span class="va">True</span>, <span class="co"># Leo for early stopping</span></span>
<span id="cb8-39"><a href="#cb8-39" aria-hidden="true" tabindex="-1"></a> greater_is_better<span class="op">=</span><span class="va">False</span>, <span class="co"># Leo for early stopping</span></span>
<span id="cb8-40"><a href="#cb8-40" aria-hidden="true" tabindex="-1"></a> push_to_hub<span class="op">=</span><span class="va">False</span></span>
<span id="cb8-41"><a href="#cb8-41" aria-hidden="true" tabindex="-1"></a>)</span>
<span id="cb8-42"><a href="#cb8-42" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-43"><a href="#cb8-43" aria-hidden="true" tabindex="-1"></a>trainer <span class="op">=</span> Trainer(</span>
<span id="cb8-44"><a href="#cb8-44" aria-hidden="true" tabindex="-1"></a> model<span class="op">=</span>model,</span>
<span id="cb8-45"><a href="#cb8-45" aria-hidden="true" tabindex="-1"></a> data_collator<span class="op">=</span>data_collator,</span>
<span id="cb8-46"><a href="#cb8-46" aria-hidden="true" tabindex="-1"></a> args<span class="op">=</span>training_args,</span>
<span id="cb8-47"><a href="#cb8-47" aria-hidden="true" tabindex="-1"></a> compute_metrics<span class="op">=</span>compute_metrics,</span>
<span id="cb8-48"><a href="#cb8-48" aria-hidden="true" tabindex="-1"></a> train_dataset<span class="op">=</span>common_voice_train,</span>
<span id="cb8-49"><a href="#cb8-49" aria-hidden="true" tabindex="-1"></a> eval_dataset<span class="op">=</span>common_voice_test,</span>
<span id="cb8-50"><a href="#cb8-50" aria-hidden="true" tabindex="-1"></a> tokenizer<span class="op">=</span>processor.feature_extractor</span>
<span id="cb8-51"><a href="#cb8-51" aria-hidden="true" tabindex="-1"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="attaching-a-language-model" class="level2">
<h2 class="anchored" data-anchor-id="attaching-a-language-model">Attaching a language model</h2>
<div>
</div>
<div class="quarto-layout-panel" data-layout-ncol="2">
<div class="quarto-layout-row">
<section id="section" class="level3 quarto-layout-cell" style="flex-basis: 50.0%;justify-content: flex-start;">
<h3 class="anchored" data-anchor-id="section"></h3>
<ul>
<li>Small vocabulary in the training data
<ul>
<li>Gibberish in the results, as only phonetic segments are recognised: <em>Conservatoire</em> becomes <em>crosefwert oach</em> or <em>konssrwat dooane</em></li>
</ul></li>
<li>Compilation of a language model for Luxembourgish
<ul>
<li>Available text data (Chamber, RTL articles, 100,7 articles etc.)</li>
<li>77 million word tokens</li>
<li>Extract ngrams (1-grams, 2-grams, …, 5-grams) and their frequency</li>
</ul></li>
<li>The language model adds word-level probabilities on top of the acoustic model (see the sketch below)</li>
</ul>
</section>
<section id="section-1" class="level3 quarto-layout-cell" style="flex-basis: 50.0%;justify-content: center;">
<h3 class="anchored" data-anchor-id="section-1"></h3>
<div class="quarto-figure quarto-figure-left">
<figure class="figure">
<p><img src="Speech%20Recognition%20-%20Data%20Science%20and%20the%20Humanities%20-%202024_files/figure-html/c9c84edb-659c-468a-8acb-8bff777b8074-1-9b2955a1-2122-4124-b7aa-3adf994ca44c.png" class="img-fluid quarto-figure quarto-figure-left figure-img" style="width:60.0%"></p>
</figure>
</div>
</section>
</div>
</div>
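<p>A common way to attach such an n-gram language model to a fine-tuned wav2vec 2.0 checkpoint is a KenLM model combined with <code>pyctcdecode</code>. The sketch below follows that general recipe; the n-gram file name and output path are placeholders, not the project's actual files.</p>
<div class="sourceCode"><pre class="sourceCode python"><code class="sourceCode python"># Sketch: wrap the CTC vocabulary and a KenLM n-gram model into a beam-search
# decoder, then bundle it with the processor (placeholder file names).
from pyctcdecode import build_ctcdecoder
from transformers import Wav2Vec2ProcessorWithLM

vocab = processor.tokenizer.get_vocab()
sorted_vocab = sorted(vocab.items(), key=lambda item: item[1])
labels = [token for token, _ in sorted_vocab]

decoder = build_ctcdecoder(
    labels=labels,
    kenlm_model_path="5gram_luxembourgish.arpa",   # placeholder n-gram LM file
)

processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder,
)
processor_with_lm.save_pretrained("./wav2vec2-large-xls-r-LUXEMBOURGISH_with_LM")
</code></pre></div>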
</section>
<section id="openais-whisper-radfordrobustspeechrecognition2022" class="level2">
<h2 class="anchored" data-anchor-id="openais-whisper-radfordrobustspeechrecognition2022">OpenAI’s Whisper (<span class="citation" data-cites="radfordRobustSpeechRecognition2022">Radford et al. (<a href="#ref-radfordRobustSpeechRecognition2022" role="doc-biblioref">2022</a>)</span>)</h2>
<ul>
<li>Introduced in September 2022</li>
<li>Sequence-to-sequence (seq2seq) architecture to learn the contextual representation of speech data</li>
<li>680,000 hours of training data from many languages, with transcriptions (e.g. YouTube videos)</li>
<li>‘weakly-supervised’ pretraining</li>
<li>Pre-training with up to 1.5 billion parameters</li>
</ul>
<table class="table">
<thead>
<tr class="header">
<th>Size</th>
<th>Layers</th>
<th>Width</th>
<th>Heads</th>
<th>Parameters</th>
<th>English-only</th>
<th>Multilingual</th>
<th></th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>tiny</td>
<td>4</td>
<td>384</td>
<td>6</td>
<td>39 M</td>
<td>✓</td>
<td>✓</td>
<td></td>
</tr>
<tr class="even">
<td>base</td>
<td>6</td>
<td>512</td>
<td>8</td>
<td>74 M</td>
<td>✓</td>
<td>✓</td>
<td></td>
</tr>
<tr class="odd">
<td>small</td>
<td>12</td>
<td>768</td>
<td>12</td>
<td>244 M</td>
<td>✓</td>
<td>✓</td>
<td></td>
</tr>
<tr class="even">
<td>medium</td>
<td>24</td>
<td>1024</td>
<td>16</td>
<td>769 M</td>
<td>✓</td>
<td>✓</td>
<td></td>
</tr>
<tr class="odd">
<td>large</td>
<td>32</td>
<td>1280</td>
<td>20</td>
<td>1550 M</td>
<td>x</td>
<td>✓</td>
<td></td>
</tr>
</tbody>
</table>
<ul>
<li>Luxembourgish included as a language
<ul>
<li>with very bad quality, though ;-)</li>
</ul></li>
<li>Fine-tuning with our dataset (<span class="citation" data-cites="gandhiFineTuneWhisperMultilingual2022">Gandhi (<a href="#ref-gandhiFineTuneWhisperMultilingual2022" role="doc-biblioref">2022</a>)</span>)</li>
</ul>
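<p>For a quick impression of Whisper on Luxembourgish audio, the model can be run through the Hugging Face <code>pipeline</code>; a minimal sketch with placeholder model and file names:</p>
<div class="sourceCode"><pre class="sourceCode python"><code class="sourceCode python"># Sketch: transcribe an audio file with a (fine-tuned) Whisper checkpoint.
# Model name and audio file are placeholders.
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",   # or a Luxembourgish fine-tuned checkpoint
    chunk_length_s=30,              # Whisper processes audio in 30-second windows
)

result = asr("example.wav")
print(result["text"])
</code></pre></div>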
</section>
<section id="whisper" class="level2">
<h2 class="anchored" data-anchor-id="whisper">Whisper</h2>
<ul>
<li>Building blocks of Whisper</li>
</ul>
<p><img src="Speech%20Recognition%20-%20Data%20Science%20and%20the%20Humanities%20-%202024_files/figure-html/d70eafe7-304a-4ec6-9cb0-433b6730e430-2-3a2a54f2-c345-408c-8194-1b464a8e184f.png" class="img-fluid"></p>
<ul>
<li>Performance of Whisper (Word Error Rate) <img src="Speech%20Recognition%20-%20Data%20Science%20and%20the%20Humanities%20-%202024_files/figure-html/d70eafe7-304a-4ec6-9cb0-433b6730e430-1-3741c1d2-45eb-48fd-9748-8c9fe46808ae.png" class="img-fluid"></li>
</ul>
</section>
<section id="evaluation-of-wav2vec-2.0-and-whisper" class="level2">
<h2 class="anchored" data-anchor-id="evaluation-of-wav2vec-2.0-and-whisper">Evaluation of wav2vec 2.0 and Whisper</h2>
<audio controls="" src="Chamber2022.wav">
</audio>
<table class="table">
<colgroup>
<col style="width: 33%">
<col style="width: 33%">
<col style="width: 33%">
</colgroup>
<thead>
<tr class="header">
<th>Ground truth</th>
<th>Wav2vec 2.0 (1B)</th>
<th>Whisper (large-v2)</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Villmools merci, Här President. Den Avis vun den Experten huet kloer gewisen, datt d’Covid-Kris nach net eriwwer ass, dass de Risk nach ëmmer do ass an datt d’Expektative fir September, zumindest wat d’Experten ugeet, déi sinn, datt mer eventuell virun enger neier Well kënne stoen. Op wellechem Datum, wellech Variant, mat wellecher Virulenz, welleger Ustiechlegkeet, dat wësse mer an dësem Moment selbstverständlech net. Déi Zuelen, déi mer haut kennen, soen dat selwecht. De Staatsminister huet gëschter zitéiert: 1200 Infektiounen den Dag. Dat ass eppes, muss ech soen, wat eis virun enger Rei Joer, virun enger Rei Méint jo weesentlech méi erschreckt huet, wéi et eis haut erschreckt, well d’Situatioun</td>
<td>villmools merci här president den avis vun den experten huet kloer gewisen datt d’covidkris nach net eriwwer ass dass de risk nach ëmmer do ass an datt d’expektative fir september zumindest wat d’experten ugeet déi sinn datt mer eventuell virun enger neier well kënne stoen op wellechem datum <strong>well ech</strong> variant mat wellecher virulenz wellecher <strong>ustieche keet</strong> dat wësse mer an dësem moment selbstverständlech net déi zuelen déi mer haut kennen soen dat selwecht de staatsminister huet gëschter zitéiert den <strong>auszweehonnert</strong> infektiounen den dag dat ass eppes muss ech soen wat eis virun enger rei <strong>t eis</strong> haut erschreckt <strong>war</strong> d’situatioun</td>
<td>Villmools merci, Här President. Den Avis vun den Experten huet kloer gewisen, datt d’Covidkris nach net eriwwer ass, dass de Risk nach ëmmer do ass an datt d’Expektative fir September zumindest wat d’Experten ugeet, déi sinn, datt mer eventuell virun enger neier Well kënne stoen. Op <strong>wellegem</strong> Datum, <strong>welleg</strong> Variant, mat <strong>welleger</strong> Virulenz, welleger Ustiech<strong>tegkeet</strong>, dat wësse mer an dësem Moment selbstverständlech net. Déi Zuelen, déi mer haut kennen, soen dat selwecht. De Staatsminister huet gëschter zitéiert: 1200 Infektiounen den Dag. Dat ass eppes, muss ech soen, wat eis virun enger Rei Joer, virun enger Rei Méint jo weesentlech méi erschreckt huet, wéi et eis haut erschreckt, well d’Situatioun</td>
</tr>
</tbody>
</table>
</section>
<section id="evaluation" class="level2">
<h2 class="anchored" data-anchor-id="evaluation">Evaluation</h2>
<ul>
<li>Word Error Rates (WER); a sketch of how such a score is computed follows this list
<ul>
<li>wav2vec 2.0: 10</li>
<li>Whisper: 13</li>
</ul></li>
<li>In practice, however, Whisper outperforms wav2vec 2.0 despite its higher WER
<ul>
<li>fewer disruptive recognition errors</li>
<li>numbers recognised</li>
<li>capitalisation and punctuation restored</li>
</ul></li>
</ul>
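<p>A minimal sketch of how such a score can be computed, assuming the <code>jiwer</code> package (the evaluation pipeline actually used is not specified here); it reuses a fragment of the Chamber excerpt above:</p>
<pre class="sourceCode python"><code>import string
import jiwer

reference = "Op wellechem Datum, wellech Variant, mat wellecher Virulenz"
hypothesis = "op wellechem datum well ech variant mat wellecher virulenz"

def normalise(text):
    """Lower-case and strip punctuation so that formatting differences
    (which wav2vec 2.0 never produces) do not inflate the score."""
    return text.lower().translate(str.maketrans("", "", string.punctuation))

error_rate = jiwer.wer(normalise(reference), normalise(hypothesis))
print(f"WER: {error_rate:.1%}")  # 2 edits over 8 reference words: 25.0%
</code></pre>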
</section>
<section id="next-challenge-multilingual-asr" class="level2">
<h2 class="anchored" data-anchor-id="next-challenge-multilingual-asr">Next challenge: Multilingual ASR</h2>
<audio controls="" src="Grand_Duc_2018.wav">
</audio>
<table class="table">
<colgroup>
<col style="width: 50%">
<col style="width: 50%">
</colgroup>
<thead>
<tr class="header">
<th>Wav2vec 2.0</th>
<th>Whisper</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>mir sollen houfreg sinn op d’ diversitéit an den zesummenhalt an eiser gesellschaft <strong>a setandroivoudr remercier les non luxembourgeois kee résidu qui travaille en entrep pour leur contribution précieuse an neutre société la cohésion économique mee aus si la sociale den autrepéys sont des atouts kinsapartier de défense dat oury i sont d’ acer den entree tenure reussite senatus</strong> haut op dësem chrescht owent wëll ech meng unerkennung awer net nëmmen op de politesche plang begrenzen</td>
<td>Mir sollen houfreg sinn op d’Diversitéit an den Zesummenhalt an eise Gesellschaft. <strong>A cet endroit, je voudrais remercier les non luxembourgeois qui resident ou qui travaillent à notre pays pour leur contribution précieuse à notre société. cohésion économique, mais aussi la cohésion sociale de notre pays sont des atouts qui nous appartiennent de défendre à tout prix. Ils sont au coeur de notre projet et de notre réussite. C’ est notre bien commun à tous.</strong> Haut, op dësem chrëschtel Wënd, wëll ech meng Unerkennung awer net nëmmen op de politesche Plang begrenzen.</td>
</tr>
<tr class="even">
<td><em>Problems with the French part</em></td>
<td><em>The switch into French is recognised</em></td>
</tr>
</tbody>
</table>
</section>
<section id="demos" class="level2">
<h2 class="anchored" data-anchor-id="demos">Demos</h2>
<ul>
<li>Available online at Hugging Face: <a href="https://huggingface.co/unilux" class="uri">https://huggingface.co/unilux</a>; a loading sketch follows below</li>
<li>Further reading
<ul>
<li><span class="citation" data-cites="nguyen2022">(<a href="#ref-nguyen2022" role="doc-biblioref"><strong>nguyen2022?</strong></a>)</span></li>
<li><span class="citation" data-cites="gillesLUXASRBuildingASR2023">Gilles, Hosseini Kivanani, and Hillah (<a href="#ref-gillesLUXASRBuildingASR2023" role="doc-biblioref">2023</a>)</span></li>
<li><span class="citation" data-cites="gillesASRLUXAUTOMATICSPEECHinpress">Gilles, Hosseini Kivanani, and Hillah (<a href="#ref-gillesASRLUXAUTOMATICSPEECHinpress" role="doc-biblioref">n.d.</a>)</span></li>
</ul></li>
</ul>
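<p>A minimal sketch of loading one of those demo models through the 🤗 Transformers pipeline; the model id below is a hypothetical placeholder, so check <a href="https://huggingface.co/unilux">https://huggingface.co/unilux</a> for the actual repository names:</p>
<pre class="sourceCode python"><code>from transformers import pipeline

# "unilux/whisper-large-lb" is a HYPOTHETICAL model id used for illustration;
# substitute a real checkpoint listed under https://huggingface.co/unilux.
asr = pipeline("automatic-speech-recognition", model="unilux/whisper-large-lb")

# Transcribe the Chamber excerpt used in the evaluation above.
result = asr("Chamber2022.wav")
print(result["text"])
</code></pre>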
</section>
<section id="references" class="level2">
<h2 class="anchored" data-anchor-id="references">References</h2>
<div id="refs" class="references csl-bib-body hanging-indent" data-entry-spacing="0" role="list">
<div id="ref-baevskiWav2vecFrameworkSelfSupervised2020" class="csl-entry" role="listitem">
Baevski, Alexei, Henry Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. <span>“Wav2vec 2.0: <span>A</span> <span>Framework</span> for <span>Self</span>-<span>Supervised</span> <span>Learning</span> of <span>Speech</span> <span>Representations</span>.”</span> arXiv. <a href="https://doi.org/10.48550/arXiv.2006.11477">https://doi.org/10.48550/arXiv.2006.11477</a>.
</div>
<div id="ref-boigneTimelineLargeTransformer2021" class="csl-entry" role="listitem">
Boigne, Jonathan. 2021. <span>“A <span>Timeline</span> of <span>Large</span> <span>Transformer</span> <span>Models</span> for <span>Speech</span>.”</span> <em>Jonathan Bgn</em>. <a href="https://jonathanbgn.com/2021/12/31/timeline-transformers-speech.html">https://jonathanbgn.com/2021/12/31/timeline-transformers-speech.html</a>.
</div>
<div id="ref-gandhiFineTuneWhisperMultilingual2022" class="csl-entry" role="listitem">
Gandhi, Sanchit. 2022. <span>“Fine-<span>Tune</span> <span>Whisper</span> <span>For</span> <span>Multilingual</span> <span>ASR</span> with 🤗 <span>Transformers</span>.”</span> <a href="https://huggingface.co/blog/fine-tune-whisper">https://huggingface.co/blog/fine-tune-whisper</a>.
</div>
<div id="ref-gillesLUXASRBuildingASR2023" class="csl-entry" role="listitem">
Gilles, Peter, Nina Hosseini Kivanani, and Léopold Edem Ayité Hillah. 2023. <span>“<span>LUX</span>-<span>ASR</span>: <span>Building</span> an <span>ASR</span> System for the <span>Luxembourgish</span> Language.”</span> In <em>Proceedings - 2022 <span>IEEE</span> <span>Spoken</span> <span>Language</span> <span>Technology</span> <span>Workshop</span> (<span>SLT</span>)</em>, 1147–49. Doha, Qatar. <a href="https://orbilu.uni.lu/handle/10993/55105">https://orbilu.uni.lu/handle/10993/55105</a>.
</div>
<div id="ref-gillesASRLUXAUTOMATICSPEECHinpress" class="csl-entry" role="listitem">
———. n.d. <span>“<span>ASRLUX</span>: <span>Automatic</span> <span>Speech</span> <span>Recognition</span> for the <span>Low</span>-<span>Resource</span> <span>Language</span> <span>Luxembourgish</span>.”</span> In <em>Proceedings <span>ICPhS</span> 2023</em>. Prague.
</div>
<div id="ref-huiSpeechRecognitionFeature2019" class="csl-entry" role="listitem">
Hui, Jonathan. 2019. <span>“Speech <span>Recognition</span> — <span>Feature</span> <span>Extraction</span> <span>MFCC</span> & <span>PLP</span>.”</span> <em>Medium</em>. <a href="https://jonathan-hui.medium.com/speech-recognition-feature-extraction-mfcc-plp-5455f5a69dd9">https://jonathan-hui.medium.com/speech-recognition-feature-extraction-mfcc-plp-5455f5a69dd9</a>.
</div>
<div id="ref-huiSpeechRecognitionPhonetics2022" class="csl-entry" role="listitem">
———. 2022. <span>“Speech <span>Recognition</span> — <span>Phonetics</span>.”</span> <em>Medium</em>. <a href="https://jonathan-hui.medium.com/speech-recognition-phonetics-d761ea1710c0">https://jonathan-hui.medium.com/speech-recognition-phonetics-d761ea1710c0</a>.
</div>
<div id="ref-radfordRobustSpeechRecognition2022" class="csl-entry" role="listitem">
Radford, Alec, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2022. <span>“Robust <span>Speech</span> <span>Recognition</span> via <span>Large</span>-<span>Scale</span> <span>Weak</span> <span>Supervision</span>.”</span> arXiv. <a href="https://doi.org/10.48550/arXiv.2212.04356">https://doi.org/10.48550/arXiv.2212.04356</a>.
</div>
</div>
</section>
</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
const toggleBodyColorMode = (bsSheetEl) => {
const mode = bsSheetEl.getAttribute("data-mode");
const bodyEl = window.document.querySelector("body");
if (mode === "dark") {
bodyEl.classList.add("quarto-dark");
bodyEl.classList.remove("quarto-light");
} else {
bodyEl.classList.add("quarto-light");
bodyEl.classList.remove("quarto-dark");
}
}
const toggleBodyColorPrimary = () => {
const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
if (bsSheetEl) {
toggleBodyColorMode(bsSheetEl);
}
}
toggleBodyColorPrimary();
const icon = "";
const anchorJS = new window.AnchorJS();
anchorJS.options = {
placement: 'right',
icon: icon
};
anchorJS.add('.anchored');
const isCodeAnnotation = (el) => {
for (const clz of el.classList) {
if (clz.startsWith('code-annotation-')) {
return true;
}
}
return false;
}
const clipboard = new window.ClipboardJS('.code-copy-button', {
text: function(trigger) {
const codeEl = trigger.previousElementSibling.cloneNode(true);
for (const childEl of codeEl.children) {
if (isCodeAnnotation(childEl)) {
childEl.remove();
}
}
return codeEl.innerText;
}
});
clipboard.on('success', function(e) {
// button target
const button = e.trigger;
// don't keep focus
button.blur();
// flash "checked"
button.classList.add('code-copy-button-checked');
var currentTitle = button.getAttribute("title");
button.setAttribute("title", "Copied!");
let tooltip;
if (window.bootstrap) {
button.setAttribute("data-bs-toggle", "tooltip");
button.setAttribute("data-bs-placement", "left");
button.setAttribute("data-bs-title", "Copied!");
tooltip = new bootstrap.Tooltip(button,
{ trigger: "manual",
customClass: "code-copy-button-tooltip",
offset: [0, -8]});
tooltip.show();
}
setTimeout(function() {
if (tooltip) {
tooltip.hide();
button.removeAttribute("data-bs-title");
button.removeAttribute("data-bs-toggle");
button.removeAttribute("data-bs-placement");
}
button.setAttribute("title", currentTitle);
button.classList.remove('code-copy-button-checked');
}, 1000);
// clear code selection
e.clearSelection();
});
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
const config = {
allowHTML: true,
maxWidth: 500,
delay: 100,
arrow: false,
appendTo: function(el) {
return el.parentElement;
},
interactive: true,
interactiveBorder: 10,
theme: 'quarto',
placement: 'bottom-start',
};
if (contentFn) {
config.content = contentFn;
}
if (onTriggerFn) {
config.onTrigger = onTriggerFn;
}
if (onUntriggerFn) {
config.onUntrigger = onUntriggerFn;
}
window.tippy(el, config);
}
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (var i=0; i<noterefs.length; i++) {
const ref = noterefs[i];
tippyHover(ref, function() {
// use id or data attribute instead here
let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
try { href = new URL(href).hash; } catch {}
const id = href.replace(/^#\/?/, "");
const note = window.document.getElementById(id);
return note.innerHTML;
});
}
const xrefs = window.document.querySelectorAll('a.quarto-xref');
const processXRef = (id, note) => {
// Strip column container classes
const stripColumnClz = (el) => {
el.classList.remove("page-full", "page-columns");
if (el.children) {
for (const child of el.children) {
stripColumnClz(child);
}
}
}
stripColumnClz(note)
if (id === null || id.startsWith('sec-')) {
// Special case sections, only their first couple elements
const container = document.createElement("div");
if (note.children && note.children.length > 2) {
container.appendChild(note.children[0].cloneNode(true));
for (let i = 1; i < note.children.length; i++) {
const child = note.children[i];
if (child.tagName === "P" && child.innerText === "") {
continue;
} else {
container.appendChild(child.cloneNode(true));
break;
}
}
if (window.Quarto?.typesetMath) {
window.Quarto.typesetMath(container);
}
return container.innerHTML
} else {
if (window.Quarto?.typesetMath) {
window.Quarto.typesetMath(note);
}
return note.innerHTML;
}
} else {
// Remove any anchor links if they are present
const anchorLink = note.querySelector('a.anchorjs-link');
if (anchorLink) {
anchorLink.remove();
}
if (window.Quarto?.typesetMath) {
window.Quarto.typesetMath(note);
}
// TODO in 1.5, we should make sure this works without a callout special case
if (note.classList.contains("callout")) {
return note.outerHTML;
} else {
return note.innerHTML;
}
}
}
for (var i=0; i<xrefs.length; i++) {
const xref = xrefs[i];
tippyHover(xref, undefined, function(instance) {
instance.disable();
let url = xref.getAttribute('href');
let hash = undefined;
if (url.startsWith('#')) {
hash = url;
} else {
try { hash = new URL(url).hash; } catch {}
}
if (hash) {
const id = hash.replace(/^#\/?/, "");
const note = window.document.getElementById(id);
if (note !== null) {
try {
const html = processXRef(id, note.cloneNode(true));
instance.setContent(html);
} finally {
instance.enable();
instance.show();
}
} else {
// See if we can fetch this
fetch(url.split('#')[0])
.then(res => res.text())
.then(html => {
const parser = new DOMParser();
const htmlDoc = parser.parseFromString(html, "text/html");
const note = htmlDoc.getElementById(id);
if (note !== null) {
const html = processXRef(id, note);
instance.setContent(html);
}
}).finally(() => {
instance.enable();
instance.show();
});
}
} else {
// See if we can fetch a full url (with no hash to target)
// This is a special case and we should probably do some content thinning / targeting
fetch(url)
.then(res => res.text())