Open Speech Emotion Recognition Leaderboard

Welcome to the Open SER Leaderboard — part of the CAMEO project!

This leaderboard tracks how well different models recognize emotions in speech across multiple languages.

Everything is open, transparent, and reproducible - you're invited to explore, evaluate, and contribute.

This tab shows how different models perform across the entire CAMEO collection. You’ll find macro F1, weighted F1, and accuracy scores for each model, tested at different temperature settings.

It's a great place to get a quick overview of how models compare on the full dataset.


1


1


1


1

🔢 Evaluate your model

To evaluate your model according to the methodology used in our paper, you can use the following code.

import os
import string

from Levenshtein import ratio
from datasets import load_dataset, Dataset, concatenate_datasets
from sklearn.metrics import classification_report, f1_score, accuracy_score

# 🔧 Change this path to where your JSONL prediction files are stored
outputs_path = "./"

_DATASETS = [
    "cafe", "crema_d", "emns", "emozionalmente", "enterface",
    "jl_Corpus", "mesd", "nemo", "oreau", "pavoque",
    "ravdess", "resd", "subesco",
]

THRESHOLD = 0.57


def get_expected(split: str) -> tuple[set, str, dict]:
    """Load expected emotion labels and language metadata from CAMEO dataset."""
    ds = load_dataset("amu-cai/CAMEO", split=split)
    return set(ds["emotion"]), ds["language"][0], dict(zip(ds["file_id"], ds["emotion"]))


def process_outputs(dataset_name: str) -> tuple[Dataset, set, str]:
    """Clean and correct predictions, returning a Dataset with fixed predictions."""
    outputs = Dataset.from_json(os.path.join(outputs_path, f"{dataset_name}.jsonl"))
    options, language, expected = get_expected(dataset_name)

    def preprocess(x):
        return {
            "predicted": x["predicted"].translate(str.maketrans('', '', string.punctuation)).lower().strip(),
            "expected": expected.get(x["file_id"]),
        }

    outputs = outputs.map(preprocess)

    def fix_prediction(x):
        if x["predicted"] in options:
            x["fixed_prediction"] = x["predicted"]
        else:
            predicted_words = x["predicted"].split()
            label_scores = {
                label: sum(r for r in (ratio(label, word) for word in predicted_words) if r > THRESHOLD)
                for label in options
            }
            x["fixed_prediction"] = max(label_scores, key=label_scores.get)
        return x

    outputs = outputs.map(fix_prediction)
    return outputs, options, language


def calculate_metrics(outputs: Dataset, labels: set) -> dict:
    """Compute classification metrics."""
    y_true = outputs["expected"]
    y_pred = outputs["fixed_prediction"]

    return {
        "f1_macro": f1_score(y_true, y_pred, average="macro"),
        "weighted_f1": f1_score(y_true, y_pred, average="weighted"),
        "accuracy": accuracy_score(y_true, y_pred),
        "metrics_per_label": classification_report(
            y_true, y_pred, target_names=sorted(labels), output_dict=True
        ),
    }


# 🧮 Main Evaluation Loop
results = []
outputs_per_language = {}
full_outputs, full_labels = None, set()

for dataset in _DATASETS:
    jsonl_path = os.path.join(outputs_path, f"{dataset}.jsonl")

    if not os.path.isfile(jsonl_path):
        print(f"Jsonl file for {dataset} not found.")
        continue

    outputs, labels, language = process_outputs(dataset)
    metrics = calculate_metrics(outputs, labels)
    results.append({"language": language, "dataset": dataset, **metrics})

    if language not in outputs_per_language:
        outputs_per_language[language] = {"labels": labels, "outputs": outputs}
    else:
        outputs_per_language[language]["labels"] |= labels
        outputs_per_language[language]["outputs"] = concatenate_datasets([
            outputs_per_language[language]["outputs"], outputs
        ])

    full_outputs = outputs if full_outputs is None else concatenate_datasets([full_outputs, outputs])
    full_labels |= labels

# 🔤 Per-language evaluation
for language, data in outputs_per_language.items():
    metrics = calculate_metrics(data["outputs"], data["labels"])
    results.append({"language": language, "dataset": "all", **metrics})

# 🌍 Global evaluation
if full_outputs is not None:
    metrics = calculate_metrics(full_outputs, full_labels)
    results.append({"language": "all", "dataset": "all", **metrics})

# 💾 Save results
Dataset.from_list(results).to_json(os.path.join(outputs_path, "results.jsonl"))

@misc{christop2025cameocollectionmultilingualemotional,
  title={CAMEO: Collection of Multilingual Emotional Speech Corpora}, 
  author={Iwona Christop and Maciej Czajka},
  year={2025},
  eprint={2505.11051},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2505.11051}, 
}

@inproceedings{cafe,
  author = {Gournay, Philippe and Lahaie, Olivier and Lefebvre, Roch},
  title = {{A Canadian French Emotional Speech Dataset}},
  year = {2018},
  isbn = {9781450351928},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3204949.3208121},
  doi = {10.1145/3204949.3208121},
  booktitle = {Proceedings of the 9th ACM Multimedia Systems Conference},
  pages = {399–402},
  numpages = {4},
  keywords = {canadian french, digital recording, emotional speech, speech dataset},
  location = {Amsterdam, Netherlands},
  series = {MMSys '18}
}

@article{cremad,
  author = {Cao, Houwei and Cooper, David and Keutmann, Michael and Gur, Ruben and Nenkova, Ani and Verma, Ragini},
  year = {2014},
  month = {10},
  pages = {377-390},
  title = {{CREMA-D: Crowd-sourced emotional multimodal actors dataset}},
  volume = {5},
  journal = {IEEE transactions on affective computing},
  doi = {10.1109/TAFFC.2014.2336244}
}

@misc{emns,
  title={{EMNS /Imz/ Corpus: An emotive single-speaker dataset for narrative storytelling in games, television and graphic novels}},
  author={Kari Ali Noriy and Xiaosong Yang and Jian Jun Zhang},
  year={2023},
  eprint={2305.13137},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2305.13137},
}

@article{emozionalmente,
  author = {Catania, Fabio and Wilke, Jordan and Garzotto, Franca},
  year = {2025},
  month = {01},
  pages = {1-14},
  title = {{Emozionalmente: A Crowdsourced Corpus of Simulated Emotional Speech in Italian}},
  volume = {PP},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  doi = {10.1109/TASLPRO.2025.3540662}
}

@INPROCEEDINGS{enterface,
  author={Martin, O. and Kotsia, I. and Macq, B. and Pitas, I.},
  booktitle={22nd International Conference on Data Engineering Workshops (ICDEW'06)},
  title={{The eNTERFACE' 05 Audio-Visual Emotion Database}},
  year={2006},
  volume={},
  number={},
  pages={8-8},
  keywords={Audio databases;Image databases;Emotion recognition;Spatial databases;Visual databases;Signal processing algorithms;Protocols;Speech analysis;Humans;Informatics},
  doi={10.1109/ICDEW.2006.145}}

@inproceedings{jlcorpus,
  author = {James, Jesin and Tian, Li and Watson, Catherine},
  year = {2018},
  month = {09},
  pages = {2768-2772},
  title = {{An Open Source Emotional Speech Corpus for Human Robot Interaction Applications}},
  doi = {10.21437/Interspeech.2018-1349}
}

@inproceedings{mesd,
  author = {Duville, Mathilde Marie and Alonso-Valerdi, Luz and Ibarra-Zarate, David I.},
  year = {2021},
  month = {12},
  pages = {},
  title = {{The Mexican Emotional Speech Database (MESD): elaboration and assessment based on machine learning}},
  volume = {2021},
  doi = {10.1109/EMBC46164.2021.9629934}
}

@article{mesd2,
  author = {Duville, Mathilde Marie and Alonso-Valerdi, Luz and Ibarra-Zarate, David I.},
  year = {2021},
  month = {12},
  pages = {},
  title = {{Mexican Emotional Speech Database Based on Semantic, Frequency, Familiarity, Concreteness, and Cultural Shaping of Affective Prosody}},
  volume = {6},
  journal = {Data},
  doi = {10.3390/data6120130}
}

@inproceedings{christop-2024-nemo,
  title = "n{EMO}: Dataset of Emotional Speech in {P}olish",
  author = "Christop, Iwona",
  editor = "Calzolari, Nicoletta  and
    Kan, Min-Yen  and
    Hoste, Veronique  and
    Lenci, Alessandro  and
    Sakti, Sakriani  and
    Xue, Nianwen",
  booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
  month = may,
  year = "2024",
  address = "Torino, Italia",
  publisher = "ELRA and ICCL",
  url = "https://aclanthology.org/2024.lrec-main.1059/",
  pages = "12111--12116",
  abstract = "Speech emotion recognition has become increasingly important in recent years due to its potential applications in healthcare, customer service, and personalization of dialogue systems. However, a major issue in this field is the lack of datasets that adequately represent basic emotional states across various language families. As datasets covering Slavic languages are rare, there is a need to address this research gap. This paper presents the development of nEMO, a novel corpus of emotional speech in Polish. The dataset comprises over 3 hours of samples recorded with the participation of nine actors portraying six emotional states: anger, fear, happiness, sadness, surprise, and a neutral state. The text material used was carefully selected to represent the phonetics of the Polish language adequately. The corpus is freely available under the terms of a Creative Commons license (CC BY-NC-SA 4.0)."
}

@MISC{oreau,
  title = {{French emotional speech database - Or{\'e}au}},
  author = {Kerkeni, Leila and Cleder, Catherine and Serrestou, Youssef and
               Raoof, Kosai},
  abstract = {This document presents the French emotional speech database -
               Or{\'e}au, recorded in a quiet environment. The database is
               designed for general study of emotional speech and analysis of
               emotion characteristics for speech synthesis purposes. It
               contains 79 utterances which could be used in everyday life in
               the classroom. Between 10 and 13 utterances were written for
               each of the 7 emotions in French language by 32 non-professional
               speakers. 2 versions are available, the first one contains 502
               sentences. A perception test was performed to evaluate the
               recognition of emotions and their naturalness. 90\% of
               utterances (434 utterances) were correctly identified and
               retained after the test and various analyses, which constitutes
               the second version of database.},
  publisher = {Zenodo},
  year      =  {2020}
}

@inproceedings{pavoque,
  author = {Steiner, Ingmar and Schröder, Marc and Klepp, Annette},
  title = {{The PAVOQUE corpus as a resource for analysis and synthesis of expressive speech}},
  booktitle = {Phonetik & Phonologie 9. Phonetik & Phonologie (P&P-9), October 11-12, Zurich, Switzerland},
  year = {2013},
  month = {10},
  pages = {83--84},
  organization = {UZH},
  publisher = {Peter Lang}
}

@article{ravdess,
  doi = {10.1371/journal.pone.0196391},
  author = {Livingstone, Steven R. AND Russo, Frank A.},
  journal = {PLOS ONE},
  publisher = {Public Library of Science},
  title = {{The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English}},
  year = {2018},
  month = {05},
  volume = {13},
  url = {https://doi.org/10.1371/journal.pone.0196391},
  pages = {1-35},
  abstract = {The RAVDESS is a validated multimodal database of emotional speech and song. The database is gender balanced consisting of 24 professional actors, vocalizing lexically-matched statements in a neutral North American accent. Speech includes calm, happy, sad, angry, fearful, surprise, and disgust expressions, and song contains calm, happy, sad, angry, and fearful emotions. Each expression is produced at two levels of emotional intensity, with an additional neutral expression. All conditions are available in face-and-voice, face-only, and voice-only formats. The set of 7356 recordings were each rated 10 times on emotional validity, intensity, and genuineness. Ratings were provided by 247 individuals who were characteristic of untrained research participants from North America. A further set of 72 participants provided test-retest data. High levels of emotional validity and test-retest intrarater reliability were reported. Corrected accuracy and composite "goodness" measures are presented to assist researchers in the selection of stimuli. All recordings are made freely available under a Creative Commons license and can be downloaded at https://doi.org/10.5281/zenodo.1188976.},
  number = {5},
}

@misc{resd,
  author = {Artem Amentes and Nikita Davidchuk and Ilya Lubenets},
  title = {{Russian Emotional Speech Dialogs with annotated text}},
  year = {2022},
  publisher = {Hugging Face},
  journal = {Hugging Face Hub},
  howpublished = {\url{https://huggingface.co/datasets/Aniemore/resd_annotated}},
}

@article{subesco,
  doi = {10.1371/journal.pone.0250173},
  author = {Sultana, Sadia AND Rahman, M. Shahidur AND Selim, M. Reza AND Iqbal, M. Zafar},
  journal = {PLOS ONE},
  publisher = {Public Library of Science},
  title = {{SUST Bangla Emotional Speech Corpus (SUBESCO): An audio-only emotional speech corpus for Bangla}},
  year = {2021},
  month = {04},
  volume = {16},
  url = {https://doi.org/10.1371/journal.pone.0250173},
  pages = {1-27},
  abstract = {SUBESCO is an audio-only emotional speech corpus for Bangla language. The total duration of the corpus is in excess of 7 hours containing 7000 utterances, and it is the largest emotional speech corpus available for this language. Twenty native speakers participated in the gender-balanced set, each recording of 10 sentences simulating seven targeted emotions. Fifty university students participated in the evaluation of this corpus. Each audio clip of this corpus, except those of Disgust emotion, was validated four times by male and female raters. Raw hit rates and unbiased rates were calculated producing scores above chance level of responses. Overall recognition rate was reported to be above 70% for human perception tests. Kappa statistics and intra-class correlation coefficient scores indicated high-level of inter-rater reliability and consistency of this corpus evaluation. SUBESCO is an Open Access database, licensed under Creative Common Attribution 4.0 International, and can be downloaded free of charge from the web link: https://doi.org/10.5281/zenodo.4526477.},
  number = {4},
}

Open Speech Emotion Recognition Leaderboard

📝 About

🔢 Evaluate your model

📬 Submit Here!