Skip to content

Commit

Permalink
Merge branch 'main' into remove-stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
tsterbak committed Aug 3, 2023
2 parents 2f40a17 + da429be commit 543f675
Show file tree
Hide file tree
Showing 26 changed files with 573 additions and 4,538 deletions.
62 changes: 44 additions & 18 deletions biaslyze/bias_detectors/counterfactual_biasdetector.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
CounterfactualDetectionResult,
CounterfactualSample,
)
from biaslyze.augmentors import CounterfactualTextAugmentor
from biaslyze.text_representation import TextRepresentation, process_texts_with_spacy


Expand Down Expand Up @@ -46,34 +45,53 @@ class CounterfactualBiasDetector:
# see a summary of the detection
detection_res.report()
# visualize the counterfactual scores
detection_res.visualize_counterfactual_scores(concept="religion")
# visualize the counterfactual sample scores
detection_res.visualize_counterfactual_score_by_sample_histogram(concepts=["religion", "gender"])
# visualize the counterfactual scores as a dash dashboard
detection_res.dashboard()
```
Attributes:
lang: The language of the texts. Decides which concepts and keywords to use.
use_tokenizer: If keywords should only be searched in tokenized text. Can be useful for short keywords like 'she'.
concept_detector: an instance of KeywordConceptDetector
text_augmentor: an instance of CounterfactualTextAugmentor
"""

def __init__(
self,
lang: str = "en",
use_tokenizer: bool = False,
concept_detector: KeywordConceptDetector = KeywordConceptDetector(),
text_augmentor: CounterfactualTextAugmentor = CounterfactualTextAugmentor(),
):
self.lang = lang
self.use_tokenizer = use_tokenizer
self.concept_detector = concept_detector
self.text_augmentor = text_augmentor

# overwrite use_tokenizer
self.concept_detector.use_tokenizer = self.use_tokenizer
self.concept_detector = KeywordConceptDetector(lang=lang, use_tokenizer=use_tokenizer)

# load the concepts
self.concepts = load_concepts()
self.concepts = load_concepts(lang=lang)

def register_concept(self, concept: Concept):
    """Add a user-defined concept to the concepts known to this detector.

    Example usage:
    ```python
    names_concept = Concept.from_dict_keyword_list(
        name="names",
        lang="de",
        keywords=[{"keyword": "Hans", "function": ["name"]}],
    )
    bias_detector = CounterfactualBiasDetector(lang="de")
    bias_detector.register_concept(names_concept)
    ```

    Args:
        concept: The concept to add; its name must not be taken yet.

    Raises:
        ValueError: If concept is not a Concept object.
        ValueError: If a concept with this name is already registered.
    """
    # Guard clauses first: reject wrong types and duplicate names.
    if not isinstance(concept, Concept):
        raise ValueError("concept must be a Concept object.")
    name_taken = any(
        registered.name == concept.name for registered in self.concepts
    )
    if name_taken:
        raise ValueError(f"Concept '{concept.name}' already registered.")
    # NOTE(review): only self.concepts is extended here. The keyword concept
    # detector builds its keyword list from its own concepts table, so verify
    # that texts containing only custom-concept keywords are still detected.
    self.concepts.append(concept)

def process(
self,
Expand All @@ -99,6 +117,10 @@ def process(
Raises:
ValueError: If texts or predict_func is not given.
ValueError: If concepts_to_consider is not a list.
ValueError: If max_counterfactual_samples is given but not a positive integer.
ValueError: If max_counterfactual_samples_per_text is given but not a positive integer.
ValueError: If concepts_to_consider contains a concept that is not registered.
"""
if texts is None:
raise ValueError("texts must be given.")
Expand All @@ -120,9 +142,13 @@ def process(
raise ValueError(
"max_counterfactual_samples_per_text must be a positive integer."
)
if concepts_to_consider:
for c in concepts_to_consider:
if c not in [c.name for c in self.concepts]:
raise ValueError(f"Concept '{c}' not found in language '{self.lang}'.")

# find bias relevant texts
detected_texts = self.concept_detector.detect(texts)
detected_texts = self.concept_detector.detect(texts, concepts_to_consider=concepts_to_consider)

# limit the number of counterfactual samples per text if max_counterfactual_samples is given
if max_counterfactual_samples:
Expand All @@ -144,7 +170,7 @@ def process(
n_texts=max_counterfactual_samples_per_text,
)
if not counterfactual_samples:
logger.warning(f"No samples containing {concept.name} found. Skipping.")
logger.warning(f"No samples containing '{concept.name}' found. Skipping.")
continue

# calculate counterfactual scores for each keyword
Expand Down
2 changes: 1 addition & 1 deletion biaslyze/bias_detectors/lime_biasdetector.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from tqdm import tqdm

from biaslyze.concept_detectors import KeywordConceptDetector
from biaslyze.concepts import CONCEPTS
from biaslyze.concepts import CONCEPTS_EN as CONCEPTS
from biaslyze.results.lime_detection_results import (
LimeDetectionResult,
LimeSampleResult,
Expand Down
71 changes: 56 additions & 15 deletions biaslyze/concept_class.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
"""
This module contains the Concept class, which is used to represent a concept in the biaslyze package.
As well as Keyword Class, which is used to represent a keyword in the biaslyze package.
"""
"""This module contains the Concept and a Keyword class, which is used to represent a concept or respectively a keyword in the biaslyze package."""

import random
from typing import List, Optional, Tuple

from biaslyze.concepts import CONCEPTS
from biaslyze.concepts import CONCEPTS_EN, CONCEPTS_DE
from biaslyze.text_representation import TextRepresentation, Token


Expand Down Expand Up @@ -75,19 +72,53 @@ class Concept:
"""
A class used to represent a concept in the biaslyze package.
Currently the following concepts are supported:
in English:
- gender
- religion
- ethnicity
- gendered_words
- nationality
in German:
- gender
- religion
You can find out more here: [concepts.py](https://github.com/biaslyze-dev/biaslyze/blob/main/biaslyze/concepts.py).
Attributes:
name (str): The name of the concept.
keywords (List[Keyword]): The keywords of the concept.
"""

def __init__(self, name: str, keywords: List[Keyword]):
def __init__(self, name: str, lang: str, keywords: List[Keyword]):
    """Create a concept from its name, language and keywords.

    Args:
        name (str): The name of the concept.
        lang (str): The language of the concept.
        keywords (List[Keyword]): The keywords of the concept.
    """
    self.name, self.lang, self.keywords = name, lang, keywords

@classmethod
def from_dict_keyword_list(cls, name: str, keywords: List[dict]):
"""Constructs a Concept object from a list of dictionaries."""
def from_dict_keyword_list(cls, name: str, lang: str, keywords: List[dict]):
"""Constructs a Concept object from a list of dictionaries.
Example usage:
```python
names_concept = Concept.from_dict_keyword_list(
name="names",
lang="de",
keywords=[{"keyword": "Hans", "function": ["name"]}],
)
```
Args:
name (str): The name of the concept.
lang (str): The language of the concept.
keywords (List[dict]): A list of dictionaries containing the keywords of the concept.
"""
keyword_list = []
for keyword in keywords:
keyword_list.append(
Expand All @@ -97,7 +128,7 @@ def from_dict_keyword_list(cls, name: str, keywords: List[dict]):
category=keyword.get("category", None),
)
)
return cls(name, keyword_list)
return cls(name, lang, keyword_list)

def get_present_keywords(
self, text_representation: TextRepresentation
Expand Down Expand Up @@ -156,16 +187,26 @@ def get_counterfactual_texts(
return counterfactual_texts


def load_concepts() -> List[Concept]:
def load_concepts(lang: str) -> List[Concept]:
"""Loads the concepts from the concepts.py file.
Args:
lang (str): The language of the concepts to load.
TODO:
- Make this load from a JSON file instead of a Python file.
- Accept a language parameter to load the concepts for a specific language.
"""
concept_list = []
for concept_name, concept_keywords in CONCEPTS.items():
concept_list.append(
Concept.from_dict_keyword_list(concept_name, concept_keywords)
)
if lang == "en":
for concept_name, concept_keywords in CONCEPTS_EN.items():
concept_list.append(
Concept.from_dict_keyword_list(concept_name, lang, concept_keywords)
)
elif lang == "de":
for concept_name, concept_keywords in CONCEPTS_DE.items():
concept_list.append(
Concept.from_dict_keyword_list(concept_name, lang, concept_keywords)
)
else:
raise ValueError(f"Language {lang} not supported.")
return concept_list
23 changes: 18 additions & 5 deletions biaslyze/concept_detectors.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,45 @@
"""This module contains classes to detect the presence of protected concepts in texts."""
from typing import List
from typing import List, Optional

import spacy
from loguru import logger
from tqdm import tqdm

from biaslyze.concepts import CONCEPTS
from biaslyze.concepts import CONCEPTS_EN, CONCEPTS_DE


class KeywordConceptDetector:
"""Use keywords to determine if a protected concept is present in text.
Attributes:
lang: The language of the text. Currently only 'en' and 'de' are supported.
use_tokenizer: If keywords should only be searched in tokenized text. Can be useful for short keywords like 'she'.
Raises:
ValueError: If the language is not supported.
"""

def __init__(self, use_tokenizer: bool = False):
def __init__(self, lang: str = "en", use_tokenizer: bool = False):
    """Set up the detector for one language.

    Args:
        lang: The language of the text. Currently only 'en' and 'de' are supported.
        use_tokenizer: If keywords should only be searched in tokenized text.
            Can be useful for short keywords like 'she'.

    Raises:
        ValueError: If the language is not supported.
    """
    # Keep the language on the instance; the class documents `lang` as an
    # attribute, and a bare `lang = lang` would leave it unset.
    self.lang = lang
    self.use_tokenizer = use_tokenizer
    # NOTE(review): the English spaCy pipeline is loaded for every language,
    # including 'de' -- confirm this is intended for tokenizing German text.
    self._tokenizer = spacy.load(
        "en_core_web_sm", disable=["parser", "tagger", "ner", "lemmatizer"]
    )
    # Select the keyword table that matches the requested language.
    if lang == "en":
        self.concepts = CONCEPTS_EN
    elif lang == "de":
        self.concepts = CONCEPTS_DE
    else:
        raise ValueError(f"Language {lang} not supported.")

def detect(self, texts: List[str]) -> List[str]:
def detect(self, texts: List[str], concepts_to_consider: Optional[List[str]] = None) -> List[str]:
"""Detect concepts present in texts.
Returns a list of texts with the concept present.
Args:
texts: List of texts to look for protected concepts.
concepts_to_consider: List of concepts to consider. If None, all concepts are considered.
Returns:
List of texts where protected concepts are detected.
Expand All @@ -36,8 +48,9 @@ def detect(self, texts: List[str]) -> List[str]:
detected_texts = []
concept_keywords = [
keyword["keyword"]
for concept_keywords in CONCEPTS.values()
for concept_name, concept_keywords in self.concepts.items()
for keyword in concept_keywords
if (concepts_to_consider is None) or (concept_name in concepts_to_consider)
]
for text in tqdm(texts):
if self.use_tokenizer:
Expand Down
59 changes: 58 additions & 1 deletion biaslyze/concepts.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,64 @@
Function abbreviations can be found here: https://v2.spacy.io/api/annotation#pos-tagging
"""
CONCEPTS = {
# German keyword table: maps a concept name ("gender", "religion") to a list of
# keyword entries. Each entry is a dict with the keys "keyword", "function"
# (a list of strings) and "category".
# NOTE(review): the "gender" entries put a grammatical role in "function" and a
# part-of-speech tag in "category", while the "religion" entries put the
# part-of-speech tag in "function" -- confirm this difference is intended.
CONCEPTS_DE = {
"gender": [
{"keyword": "sie", "function": ["subject"], "category": "PRON"},
{"keyword": "er", "function": ["subject"], "category": "PRON"},
{"keyword": "ihr", "function": ["object"], "category": "PRON"},
{"keyword": "ihn", "function": ["object"], "category": "PRON"},
{"keyword": "ihre", "function": ["object"], "category": "PRON"},
{"keyword": "ihm", "function": ["object"], "category": "PRON"},
{"keyword": "seine", "function": ["object"], "category": "PRON"},
{"keyword": "seinen", "function": ["object"], "category": "PRON"},
{"keyword": "seiner", "function": ["object"], "category": "PRON"},
{"keyword": "seines", "function": ["object"], "category": "PRON"},
{"keyword": "ihres", "function": ["object"], "category": "PRON"},
{"keyword": "ihren", "function": ["object"], "category": "PRON"},
{"keyword": "ihrem", "function": ["object"], "category": "PRON"},
{"keyword": "ihnen", "function": ["object"], "category": "PRON"},
],
"religion": [
{"keyword": "christ", "function": ["NOUN"], "category": "religion"},
{"keyword": "christin", "function": ["NOUN"], "category": "religion"},
{"keyword": "christen", "function": ["NOUN"], "category": "religion"},
{"keyword": "christentum", "function": ["NOUN"], "category": "religion"},
{"keyword": "christlich", "function": ["ADJ"], "category": "religion"},
{"keyword": "christliche", "function": ["ADJ"], "category": "religion"},
{"keyword": "christlichen", "function": ["ADJ"], "category": "religion"},
{"keyword": "christlicher", "function": ["ADJ"], "category": "religion"},
{"keyword": "christliches", "function": ["ADJ"], "category": "religion"},
{"keyword": "christinnen", "function": ["NOUN"], "category": "religion"},
{"keyword": "christi", "function": ["NOUN"], "category": "religion"},
{"keyword": "christus", "function": ["NOUN"], "category": "religion"},
{"keyword": "jude", "function": ["NOUN"], "category": "religion"},
{"keyword": "jüdin", "function": ["NOUN"], "category": "religion"},
{"keyword": "jüdisch", "function": ["ADJ"], "category": "religion"},
{"keyword": "jüdische", "function": ["ADJ"], "category": "religion"},
{"keyword": "jüdischen", "function": ["ADJ"], "category": "religion"},
{"keyword": "jüdischer", "function": ["ADJ"], "category": "religion"},
{"keyword": "jüdisches", "function": ["ADJ"], "category": "religion"},
# NOTE(review): same keyword as the entry above, differing only in
# "function" -- confirm the duplicate is intended.
{"keyword": "jüdisches", "function": ["NOUN"], "category": "religion"},
{"keyword": "judentum", "function": ["NOUN"], "category": "religion"},
{"keyword": "muslim", "function": ["NOUN"], "category": "religion"},
{"keyword": "muslima", "function": ["NOUN"], "category": "religion"},
{"keyword": "muslimisch", "function": ["ADJ"], "category": "religion"},
{"keyword": "muslimische", "function": ["ADJ"], "category": "religion"},
{"keyword": "muslimischen", "function": ["ADJ"], "category": "religion"},
{"keyword": "muslimischer", "function": ["ADJ"], "category": "religion"},
{"keyword": "muslimisches", "function": ["ADJ"], "category": "religion"},
{"keyword": "muslime", "function": ["NOUN"], "category": "religion"},
{"keyword": "islam", "function": ["NOUN"], "category": "religion"},
{"keyword": "islamisch", "function": ["ADJ"], "category": "religion"},
{"keyword": "islamische", "function": ["ADJ"], "category": "religion"},
{"keyword": "islamischen", "function": ["ADJ"], "category": "religion"},
{"keyword": "islamischer", "function": ["ADJ"], "category": "religion"},
{"keyword": "islamisches", "function": ["ADJ"], "category": "religion"},
],
}


CONCEPTS_EN = {
"nationality": [
{"keyword": "california", "function": ["NOUN"], "category": "region"},
{"keyword": "afghanistan", "function": ["NOUN"], "category": "country"},
Expand Down
Loading

0 comments on commit 543f675

Please sign in to comment.