Projects

Contents

Projects#

This page shows statistics about the Open Seeds projects.

import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud

# Raw GitHub location of the Open Seeds data artifacts.
baseurl = "https://raw.githubusercontent.com/open-life-science/open-life-science.github.io/main/_data/artifacts/openseeds/"

# Project table loaded from projects.csv (first CSV column used as index).
# ``na_filter=False`` keeps empty cells as "" instead of NaN, so the ``.str``
# accessors below work on every row.
project_df = (
    pd.read_csv(f"{baseurl}projects.csv", index_col=0, na_filter=False)
    .assign(
        # ", "-separated cells become Python lists.
        participants=lambda df: df.participants.str.split(", "),
        # ``assign`` evaluates keyword arguments in order, so ``participants``
        # is already a list here. Only non-empty names are counted: splitting
        # an empty cell yields [""], which would otherwise be reported as one
        # participant.
        participantNb=lambda df: df.participants.map(
            lambda names: sum(1 for name in names if name)
        ),
        mentors=lambda df: df.mentors.str.split(", "),
        keywords=lambda df: df.keywords.str.split(", "),
        # Prefix the cohort identifier, e.g. 1 -> "OLS-1".
        cohort=lambda df: "OLS-" + df.cohort.astype(str),
    )
)

Number of projects over all cohorts

# Total number of rows (one per project) in the project table.
project_df.shape[0]

Cohorts#

# Number of projects in each cohort: one row per cohort, single "Total" column.
cohort_df = (
    project_df
    .groupby("cohort")[["name"]]
    .count()
    .rename(columns={"name": "Total"})
)

Mean number of projects per cohort

# Average of the per-cohort project counts.
cohort_df["Total"].mean()

29.625

# Bar chart of the number of projects per cohort.
fig, ax = plt.subplots()
fig.set_dpi(300)
cohort_df["Total"].plot.bar(ax=ax, color="#139D3D")
ax.set_ylabel("Number of projects")
# The cohort names on the ticks are self-explanatory, so no x-axis title.
ax.set_xlabel("")

Text(0.5, 0, '')

../_images/ea66e654e3c5d616e432cd29c6057bd45f58084729a6279017a5bc95007a6f9b.png

Participants#

Mean number of participants per project

# Average number of participants across all projects.
project_df["participantNb"].mean()

1.7468354430379747

Distribution of the number of participants per project

# Histogram of how many participants each project has.
fig, ax = plt.subplots()
fig.set_dpi(300)
project_df["participantNb"].plot.hist(
    ax=ax, bins=8, color="#139D3D", legend=False
)
ax.set_xlabel('Number of participants per projects')

Text(0.5, 0, 'Number of participants per projects')

../_images/746d8673b64ce84e830b489b9b95992c2e72a62b1cf26b816e7e9871b5246b3a.png

# Mean number of participants per project within each cohort.
# Note: this rebinds ``cohort_df``.
cohort_df = project_df.groupby("cohort")[["participantNb"]].mean()

# Bar chart of the per-cohort mean.
fig, ax = plt.subplots()
fig.set_dpi(300)
cohort_df["participantNb"].plot.bar(ax=ax, color="#139D3D")
ax.set_ylabel("Mean number of participants per project")
ax.set_xlabel("")

Text(0.5, 0, '')

../_images/e3b00e0b8006cfb0b267ebe93e2efcd308bbbede7d3869cf6fe5cdcd0f052f4c.png

Keywords#

# Spelling variants and near-synonyms (as they appear after whitespace
# stripping and ``str.capitalize``) mapped to a single canonical keyword so
# that they are counted together.
KEYWORD_ALIASES = {
    "Community building": "Community",
    "Research community": "Community",
    "Open community": "Community",
    "Ethics of ai": "Ethical AI",
    "Ethical ai": "Ethical AI",
    "Enviromental": "Environmental science",
    "Equal opportunity": "Equality",
    "Training": "Training and education",
    "Education": "Training and education",
    "Artificial intelligence": "AI",
    "Ai": "AI",
    "Fair": "FAIR",
    "Open-source": "Open source",
    "Open source software": "Open source",
    "Opensource": "Open source",
    "Os": "Open source",
    "Open source projects": "Open source",
    "Visualisation": "Data visualisation",
    "Next-generation sequencing": "Sequencing",
    "Open educational resource": "Open education",
    "Reproducible research": "Reproducibility",
    "Data": "Data science",
}

# Number of projects using each (normalised) keyword, most frequent first.
keyword_df = (
    project_df[["name", "keywords"]]
    # One row per (project, keyword) pair.
    .explode("keywords")
    .assign(
        # Normalise only the keywords column, so project names are never
        # rewritten. Stripping first folds variants such as " data science"
        # into their canonical spelling.
        keywords=lambda df: (
            df.keywords.str.strip().str.capitalize().replace(KEYWORD_ALIASES)
        )
    )
    # Projects without keywords produce a blank entry; drop it so it does not
    # show up as the most frequent "keyword" or end up in the word cloud.
    .loc[lambda df: (df.keywords.fillna("") != "").to_numpy()]
    .groupby(by="keywords")
    .count()
    .rename(columns={"name": "Frequency"})
    .sort_values("Frequency", ascending=False)
)

# Ten most frequent keywords (the table is already sorted by frequency).
keyword_df.iloc[:10]

	Frequency
keywords
	63
Community	35
Open science	19
Training and education	19
Open source	18
Reproducibility	13
Data science	12
Machine learning	9
Bioinformatics	9
AI	8

# Keyword -> frequency mapping consumed by the word cloud.
frec = keyword_df["Frequency"].to_dict()

# Fixed ``random_state`` so repeated runs use the same seed.
wc = WordCloud(
    width=600,
    height=400,
    random_state=42,
    background_color="rgba(255, 255, 255, 0)",
)
wordcloud = wc.generate_from_frequencies(frec)

# Render the cloud on a bare figure: no axes, no padding.
fig, ax = plt.subplots(figsize=(13, 5))
ax.imshow(wc)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

../_images/799bc489b64a65bd50a48d53be2b8d794a1fedadd6922c8fddd9a2b9f710e0b7.png