Merge branch 'main' into remove-stuff

biaslyze-dev · Aug 3, 2023 · 543f675 · 543f675
2 parents 2f40a17 + da429be
commit 543f675
Show file tree

Hide file tree

Showing 26 changed files with 573 additions and 4,538 deletions.
diff --git a/biaslyze/bias_detectors/counterfactual_biasdetector.py b/biaslyze/bias_detectors/counterfactual_biasdetector.py
@@ -13,7 +13,6 @@
     CounterfactualDetectionResult,
     CounterfactualSample,
 )
-from biaslyze.augmentors import CounterfactualTextAugmentor
 from biaslyze.text_representation import TextRepresentation, process_texts_with_spacy
 
 
@@ -46,34 +45,53 @@ class CounterfactualBiasDetector:
         # see a summary of the detection
         detection_res.report()
 
-        # visualize the counterfactual scores
-        detection_res.visualize_counterfactual_scores(concept="religion")
-
-        # visualize the counterfactual sample scores
-        detection_res.visualize_counterfactual_score_by_sample_histogram(concepts=["religion", "gender"])
+        # visualize the counterfactual scores as a dash dashboard
+        detection_res.dashboard()
         ```
 
     Attributes:
+        lang: The language of the texts. Decides which concepts and keywords to use.
         use_tokenizer: If keywords should only be searched in tokenized text. Can be useful for short keywords like 'she'.
-        concept_detector: an instance of KeywordConceptDetector
-        text_augmentor: an instance of CounterfactualTextAugmentor
     """
 
     def __init__(
         self,
+        lang: str = "en",
         use_tokenizer: bool = False,
-        concept_detector: KeywordConceptDetector = KeywordConceptDetector(),
-        text_augmentor: CounterfactualTextAugmentor = CounterfactualTextAugmentor(), 
     ):
+        """Set up the detector for a single language.
+
+        Args:
+            lang: The language of the texts. Passed on to the keyword concept
+                detector and to `load_concepts`.
+            use_tokenizer: If keywords should only be searched in tokenized text.
+
+        Raises:
+            ValueError: If `lang` is not supported (raised by
+                `KeywordConceptDetector` and `load_concepts`).
+        """
+        self.lang = lang
         self.use_tokenizer = use_tokenizer
-        self.concept_detector = concept_detector
-        self.text_augmentor = text_augmentor
-
-        # overwrite use_tokenizer
-        self.concept_detector.use_tokenizer = self.use_tokenizer
+        # the concept detector is now built here instead of being injected by the caller
+        self.concept_detector = KeywordConceptDetector(lang=lang, use_tokenizer=use_tokenizer)
 
         # load the concepts
-        self.concepts = load_concepts()
+        self.concepts = load_concepts(lang=lang)
+
+    def register_concept(self, concept: Concept):
+        """Add a user-defined concept to the concepts known to this detector.
+
+        Example usage:
+        ```python
+        names_concept = Concept.from_dict_keyword_list(
+            name="names",
+            lang="de",
+            keywords=[{"keyword": "Hans", "function": ["name"]}],
+        )
+        bias_detector = CounterfactualBiasDetector(lang="de")
+        bias_detector.register_concept(names_concept)
+        ```
+
+        Args:
+            concept: The concept to register.
+
+        Raises:
+            ValueError: If concept is not a Concept object.
+            ValueError: If a concept with this name is already registered.
+        """
+        if not isinstance(concept, Concept):
+            raise ValueError("concept must be a Concept object.")
+
+        # concept names must stay unique within one detector
+        known_names = [known.name for known in self.concepts]
+        if concept.name in known_names:
+            raise ValueError(f"Concept '{concept.name}' already registered.")
+
+        # NOTE(review): only self.concepts is extended here; the keyword table held by
+        # self.concept_detector is not touched — confirm custom keywords are still detected.
+        self.concepts.append(concept)
 
     def process(
         self,
@@ -99,6 +117,10 @@ def process(
 
         Raises:
             ValueError: If texts or predict_func is not given.
+            ValueError: If concepts_to_consider is not a list.
+            ValueError: If max_counterfactual_samples is given but not a positive integer.
+            ValueError: If max_counterfactual_samples_per_text is given but not a positive integer.
+            ValueError: If concepts_to_consider contains a concept that is not registered.
         """
         if texts is None:
             raise ValueError("texts must be given.")
@@ -120,9 +142,13 @@ def process(
                 raise ValueError(
                     "max_counterfactual_samples_per_text must be a positive integer."
                 )
+        if concepts_to_consider:
+            for c in concepts_to_consider:
+                if c not in [c.name for c in self.concepts]:
+                    raise ValueError(f"Concept '{c}' not found in language '{self.lang}'.")
 
         # find bias relevant texts
-        detected_texts = self.concept_detector.detect(texts)
+        detected_texts = self.concept_detector.detect(texts, concepts_to_consider=concepts_to_consider)
 
         # limit the number of counterfactual samples per text if max_counterfactual_samples is given
         if max_counterfactual_samples:
@@ -144,7 +170,7 @@ def process(
                 n_texts=max_counterfactual_samples_per_text,
             )
             if not counterfactual_samples:
-                logger.warning(f"No samples containing {concept.name} found. Skipping.")
+                logger.warning(f"No samples containing '{concept.name}' found. Skipping.")
                 continue
 
             # calculate counterfactual scores for each keyword

diff --git a/biaslyze/bias_detectors/lime_biasdetector.py b/biaslyze/bias_detectors/lime_biasdetector.py
@@ -8,7 +8,7 @@
 from tqdm import tqdm
 
 from biaslyze.concept_detectors import KeywordConceptDetector
-from biaslyze.concepts import CONCEPTS
+from biaslyze.concepts import CONCEPTS_EN as CONCEPTS
 from biaslyze.results.lime_detection_results import (
     LimeDetectionResult,
     LimeSampleResult,

diff --git a/biaslyze/concept_class.py b/biaslyze/concept_class.py
@@ -1,12 +1,9 @@
-"""
-This module contains the Concept class, which is used to represent a concept in the biaslyze package.
-As well as Keyword Class, which is used to represent a keyword in the biaslyze package.
-"""
+"""This module contains the Concept and Keyword classes, which represent a concept and a keyword, respectively, in the biaslyze package."""
 
 import random
 from typing import List, Optional, Tuple
 
-from biaslyze.concepts import CONCEPTS
+from biaslyze.concepts import CONCEPTS_EN, CONCEPTS_DE
 from biaslyze.text_representation import TextRepresentation, Token
 
 
@@ -75,19 +72,53 @@ class Concept:
     """
     A class used to represent a concept in the biaslyze package.
 
+    Currently the following concepts are supported:
+
+    in English:
+
+    - gender
+    - religion
+    - ethnicity
+    - gendered_words
+    - nationality
+    
+    in German:
+
+    - gender
+    - religion
+
+    You can find out more here: [concepts.py](https://github.com/biaslyze-dev/biaslyze/blob/main/biaslyze/concepts.py).
+
     Attributes:
         name (str): The name of the concept.
         keywords (List[Keyword]): The keywords of the concept.
     """
 
-    def __init__(self, name: str, keywords: List[Keyword]):
+    def __init__(self, name: str, lang: str, keywords: List[Keyword]):
-        """The constructor for the Concept class."""
+        """The constructor for the Concept class.
+
+        Args:
+            name: The name of the concept.
+            lang: The language of the concept. Stored as given; it is not
+                validated against the supported languages here.
+            keywords: The keywords of the concept.
+        """
         self.name = name
+        self.lang = lang
         self.keywords = keywords
 
     @classmethod
-    def from_dict_keyword_list(cls, name: str, keywords: List[dict]):
-        """Constructs a Concept object from a list of dictionaries."""
+    def from_dict_keyword_list(cls, name: str, lang: str, keywords: List[dict]):
+        """Constructs a Concept object from a list of dictionaries.
+
+        Example usage:
+        ```python
+        names_concept = Concept.from_dict_keyword_list(
+            name="names",
+            lang="de",
+            keywords=[{"keyword": "Hans", "function": ["name"]}],
+        )
+        ```
+        
+        Args:
+            name (str): The name of the concept.
+            lang (str): The language of the concept.
+            keywords (List[dict]): A list of dictionaries containing the keywords of the concept.
+
+        """
         keyword_list = []
         for keyword in keywords:
             keyword_list.append(
@@ -97,7 +128,7 @@ def from_dict_keyword_list(cls, name: str, keywords: List[dict]):
                     category=keyword.get("category", None),
                 )
             )
-        return cls(name, keyword_list)
+        return cls(name, lang, keyword_list)
 
     def get_present_keywords(
         self, text_representation: TextRepresentation
@@ -156,16 +187,26 @@ def get_counterfactual_texts(
         return counterfactual_texts
 
 
-def load_concepts() -> List[Concept]:
+def load_concepts(lang: str = "en") -> List[Concept]:
     """Loads the concepts from the concepts.py file.
 
+    Args:
+        lang (str): The language of the concepts to load ('en' or 'de').
+            Defaults to 'en', matching the detectors' default and keeping
+            zero-argument calls working.
+
+    Returns:
+        List[Concept]: One Concept per entry in the keyword table of the given language.
+
+    Raises:
+        ValueError: If the language is not supported.
+
     TODO:
     - Make this load from a JSON file instead of a Python file.
-    - Accept a language parameter to load the concepts for a specific language.
     """
+    # Select the keyword table first, so the conversion loop exists only once
+    # and an unsupported language fails before any Concept is built.
+    if lang == "en":
+        concepts_for_lang = CONCEPTS_EN
+    elif lang == "de":
+        concepts_for_lang = CONCEPTS_DE
+    else:
+        raise ValueError(f"Language {lang} not supported.")
     concept_list = []
-    for concept_name, concept_keywords in CONCEPTS.items():
-        concept_list.append(
-            Concept.from_dict_keyword_list(concept_name, concept_keywords)
-        )
+    for concept_name, concept_keywords in concepts_for_lang.items():
+        concept_list.append(
+            Concept.from_dict_keyword_list(concept_name, lang, concept_keywords)
+        )
     return concept_list
diff --git a/biaslyze/concept_detectors.py b/biaslyze/concept_detectors.py
@@ -1,33 +1,45 @@
 """This module contains classes to detect the presence of protected concepts in texts."""
-from typing import List
+from typing import List, Optional
 
 import spacy
 from loguru import logger
 from tqdm import tqdm
 
-from biaslyze.concepts import CONCEPTS
+from biaslyze.concepts import CONCEPTS_EN, CONCEPTS_DE
 
 
 class KeywordConceptDetector:
     """Use keywords to determine if a protected concept is present in text.
 
     Attributes:
+        lang: The language of the text. Currently only 'en' and 'de' are supported.
         use_tokenizer: If keywords should only be searched in tokenized text. Can be useful for short keywords like 'she'.
+    
+    Raises:
+        ValueError: If the language is not supported.
     """
 
-    def __init__(self, use_tokenizer: bool = False):
+    def __init__(self, lang: str = "en", use_tokenizer: bool = False):
+        """Select the keyword table for the language and load the spaCy tokenizer.
+
+        Args:
+            lang: The language of the text. Currently only 'en' and 'de' are supported.
+            use_tokenizer: If keywords should only be searched in tokenized text.
+
+        Raises:
+            ValueError: If the language is not supported.
+        """
+        # Resolve the language first so an unsupported value fails before the
+        # spaCy model is loaded.
+        if lang == "en":
+            self.concepts = CONCEPTS_EN
+        elif lang == "de":
+            self.concepts = CONCEPTS_DE
+        else:
+            raise ValueError(f"Language {lang} not supported.")
+        # keep the language on the instance, as documented in the class attributes
+        self.lang = lang
         self.use_tokenizer = use_tokenizer
+        # NOTE(review): the English model is loaded for every language, including 'de' —
+        # confirm this is intended for tokenizing German text.
         self._tokenizer = spacy.load(
             "en_core_web_sm", disable=["parser", "tagger", "ner", "lemmatizer"]
         )
 
-    def detect(self, texts: List[str]) -> List[str]:
+    def detect(self, texts: List[str], concepts_to_consider: Optional[List[str]] = None) -> List[str]:
         """Detect concepts present in texts.
 
         Returns a list of texts with the concept present.
 
         Args:
             texts: List of texts to look for protected concepts.
+            concepts_to_consider: List of concepts to consider. If None, all concepts are considered.
 
         Returns:
             List of texts where protected concepts are detected.
@@ -36,8 +48,9 @@ def detect(self, texts: List[str]) -> List[str]:
         detected_texts = []
         concept_keywords = [
             keyword["keyword"]
-            for concept_keywords in CONCEPTS.values()
+            for concept_name, concept_keywords in self.concepts.items()
             for keyword in concept_keywords
+            if (concepts_to_consider is None) or (concept_name in concepts_to_consider)
         ]
         for text in tqdm(texts):
             if self.use_tokenizer:

diff --git a/biaslyze/concepts.py b/biaslyze/concepts.py
@@ -2,7 +2,64 @@
 
 Function abbreviations can be found here: https://v2.spacy.io/api/annotation#pos-tagging 
 """
-CONCEPTS = {
+# German keyword table: maps a concept name to a list of keyword entries.
+# Each entry is a dict with the keys "keyword", "function" (a list) and "category".
+# NOTE(review): the "gender" entries use 'subject'/'object' as function and 'PRON' as
+# category, while the "religion" entries use POS tags as function — confirm this is intended.
+CONCEPTS_DE = {
+    "gender": [
+        {"keyword": "sie", "function": ["subject"], "category": "PRON"},
+        {"keyword": "er", "function": ["subject"], "category": "PRON"},
+        {"keyword": "ihr", "function": ["object"], "category": "PRON"},
+        {"keyword": "ihn", "function": ["object"], "category": "PRON"},
+        {"keyword": "ihre", "function": ["object"], "category": "PRON"},
+        {"keyword": "ihm", "function": ["object"], "category": "PRON"},
+        {"keyword": "seine", "function": ["object"], "category": "PRON"},
+        {"keyword": "seinen", "function": ["object"], "category": "PRON"},
+        {"keyword": "seiner", "function": ["object"], "category": "PRON"},
+        {"keyword": "seines", "function": ["object"], "category": "PRON"},
+        {"keyword": "ihres", "function": ["object"], "category": "PRON"},
+        {"keyword": "ihren", "function": ["object"], "category": "PRON"},
+        {"keyword": "ihrem", "function": ["object"], "category": "PRON"},
+        {"keyword": "ihnen", "function": ["object"], "category": "PRON"},
+    ],
+    "religion": [
+        {"keyword": "christ", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "christin", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "christen", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "christentum", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "christlich", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "christliche", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "christlichen", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "christlicher", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "christliches", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "christinnen", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "christi", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "christus", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "jude", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "jüdin", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "jüdisch", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "jüdische", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "jüdischen", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "jüdischer", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "jüdisches", "function": ["ADJ"], "category": "religion"},
+        # NOTE(review): "jüdisches" is listed a second time, here as NOUN — confirm the
+        # duplicate keyword is intended and not a copy-paste slip.
+        {"keyword": "jüdisches", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "judentum", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "muslim", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "muslima", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "muslimisch", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "muslimische", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "muslimischen", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "muslimischer", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "muslimisches", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "muslime", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "islam", "function": ["NOUN"], "category": "religion"},
+        {"keyword": "islamisch", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "islamische", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "islamischen", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "islamischer", "function": ["ADJ"], "category": "religion"},
+        {"keyword": "islamisches", "function": ["ADJ"], "category": "religion"},
+    ],        
+}
+
+
+CONCEPTS_EN = {
     "nationality": [
         {"keyword": "california", "function": ["NOUN"], "category": "region"},
         {"keyword": "afghanistan", "function": ["NOUN"], "category": "country"},