Analysis • babynamesIL

library(babynamesIL)
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.4     ✔ readr     2.1.5
#> ✔ forcats   1.0.0     ✔ stringr   1.5.1
#> ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
#> ✔ purrr     1.0.2     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tgstat)
theme_set(theme_classic())

Israeli baby names

Distribution of names

We will start by looking at the distribution total number of babies for each name:

babynamesIL_totals %>%
    mutate(sector = factor(sector, levels = c("Jewish", "Muslim", "Christian", "Druze", "Other"))) %>%
    ggplot(aes(x = total, color = sex)) +
    ggsci::scale_color_aaas() +
    geom_density() +
    scale_x_log10() +
    facet_grid(. ~ sector) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

Note that the x axis is in log scale.

Top names

Top 20 names in each sex and sector:

babynamesIL_totals %>%
    mutate(sector = factor(sector, levels = c("Jewish", "Muslim", "Christian", "Druze", "Other"))) %>%
    group_by(sector, sex) %>%
    slice_max(order_by = total, n = 20) %>%
    arrange(sector, sex, desc(total)) %>%
    mutate(name = forcats::fct_inorder(name)) %>%
    ggplot(aes(x = name, y = total)) +
    geom_col() +
    facet_wrap(sector ~ sex, scales = "free", ncol = 2) +
    ylab("total #") +
    xlab("") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

Names over time

a single name

babynamesIL %>%
    tidyr::complete(sector, year, sex, name, fill = list(n = 0, prop = 0)) %>%
    filter(name == "מעיין", sector == "Jewish") %>%
    ggplot(aes(x = year, y = prop, color = sex)) +
    geom_line() +
    ggsci::scale_color_aaas() +
    scale_y_continuous(labels = scales::percent) +
    ggtitle("מעיין הבן מול מעיין הבת") +
    theme_classic()

clustering

We will then create a matrix of the names and their frequencies over time. We will start with Jewish female babies.

names_mat <- babynamesIL %>%
    filter(sector == "Jewish", sex == "F") %>%
    select(year, name, prop) %>%
    spread(year, prop, fill = 0) %>%
    column_to_rownames("name") %>%
    as.matrix()
dim(names_mat)
#> [1] 1503   76

Normalize each name:

mat_norm <- names_mat / rowSums(names_mat)

Select only names with at least 500 babies:

mat_norm_f <- mat_norm[babynamesIL_totals %>%
    filter(sector == "Jewish", sex == "F") %>%
    filter(total >= 500) %>%
    pull(name), ]
dim(mat_norm_f)
#> [1] 503  76

Cluster:

hc <- tgs_cor(t(mat_norm_f)) %>%
    tgs_dist() %>%
    hclust(method = "ward.D2")

Reorder the clustering by year:

hc <- as.hclust(reorder(
    as.dendrogram(hc),
    apply(mat_norm_f, 1, which.max),
    agglo.FUN = mean
))

Plot the matrix:

text_mat <- babynamesIL %>%
    filter(sector == "Jewish", sex == "F") %>%
    tidyr::complete(sector, year, sex, name, fill = list(n = 0)) %>%
    mutate(text = paste(name, paste0("year: ", year), paste0("n: ", n), sep = "\n")) %>%
    select(year, name, text) %>%
    spread(year, text) %>%
    column_to_rownames("name") %>%
    as.matrix()
plotly::plot_ly(z = mat_norm_f[hc$order, ], y = rownames(mat_norm_f)[hc$order], x = colnames(mat_norm_f), type = "heatmap", colors = colorRampPalette(c("white", "blue", "red", "yellow"))(1000), hoverinfo = "text", text = text_mat[hc$order, ]) %>%
    plotly::layout(yaxis = list(title = ""), xaxis = list(title = "Year"))

We will wrap it all in a function:

cluster_names <- function(sector, sex, min_total = 500, colors = colorRampPalette(c
                          ("white", "blue", "red", "yellow"))(1000)) {
    names_mat <- babynamesIL %>%
        filter(sector == !!sector, sex == !!sex) %>%
        select(year, name, prop) %>%
        spread(year, prop, fill = 0) %>%
        column_to_rownames("name") %>%
        as.matrix()
    text_mat <- babynamesIL %>%
        filter(sector == !!sector, sex == !!sex) %>%
        tidyr::complete(sector, year, sex, name, fill = list(n = 0)) %>%
        mutate(text = paste(name, paste0("year: ", year), paste0("n: ", n), sep = "\n")) %>%
        select(year, name, text) %>%
        spread(year, text) %>%
        column_to_rownames("name") %>%
        as.matrix()
    mat_norm <- names_mat / rowSums(names_mat)
    mat_norm_f <- mat_norm[babynamesIL_totals %>%
        filter(sector == !!sector, sex == !!sex) %>%
        filter(total >= min_total) %>%
        pull(name), ]
    text_mat <- text_mat[rownames(mat_norm_f), colnames(mat_norm_f)]
    hc <- tgs_cor(t(mat_norm_f)) %>%
        tgs_dist() %>%
        hclust(method = "ward.D2")
    hc <- as.hclust(reorder(
        as.dendrogram(hc),
        apply(mat_norm_f, 1, which.max),
        agglo.FUN = mean
    ))
    plotly::plot_ly(z = mat_norm_f[hc$order, ], y = rownames(mat_norm_f)[hc$order], x = colnames(mat_norm_f), type = "heatmap", colors = colors, hoverinfo = "text", text = text_mat[hc$order, ]) %>%
        plotly::layout(yaxis = list(title = ""), xaxis = list(title = "Year"))
}

We can now plot also the Male names:

cluster_names("Jewish", "M")

Or other sectors:

cluster_names("Muslim", "M")

cluster_names("Muslim", "F")

cluster_names("Christian", "M", 50)

cluster_names("Christian", "F", 50)

cluster_names("Druze", "M", 50)

cluster_names("Druze", "F", 50)

cluster_names("Other", "M", 50)

cluster_names("Other", "F", 50)

Unisex names

We can plot names that are used for both male and female in a given year, e.g. 2022:

babynamesIL %>%
    filter(sector == "Jewish", year == 2022) %>%
    pivot_wider(names_from = "sex", values_from = c("n", "prop"), values_fill = 0) %>%
    filter(n_M > 0 & n_F > 0) %>%
    ggplot(aes(x = n_M, y = n_F, label = name)) +
    geom_point() +
    scale_x_log10() +
    scale_y_log10() +
    ggrepel::geom_text_repel() +
    geom_abline()

Or we can use the matrices we created before to find patterns in the ratio between male and female over time:

cluster_unisex_names <- function(sector, colors = colorRampPalette(c("blue", "white", "red"))(1000), epsilon = 1e-3) {
    mat_M <- babynamesIL %>%
        filter(sector == !!sector, sex == "M") %>%
        tidyr::complete(sector, year, sex, name, fill = list(n = 0, prop = 0)) %>%
        select(year, name, prop) %>%
        spread(year, prop, fill = 0) %>%
        column_to_rownames("name") %>%
        as.matrix()
    mat_F <- babynamesIL %>%
        filter(sector == !!sector, sex == "F") %>%
        tidyr::complete(sector, year, sex, name, fill = list(n = 0, prop = 0)) %>%
        select(year, name, prop) %>%
        spread(year, prop, fill = 0) %>%
        column_to_rownames("name") %>%
        as.matrix()
    uni_names <- intersect(rownames(mat_M), rownames(mat_F))
    ratio_mat <- log2(mat_M[uni_names, ] + epsilon) - log2(mat_F[uni_names, ] + epsilon)
    text_mat <- babynamesIL %>%
        filter(sector == !!sector) %>%
        tidyr::complete(sector, year, sex, name, fill = list(n = 0, prop = 0)) %>%
        pivot_wider(names_from = "sex", values_from = c("n", "prop"), values_fill = 0) %>%
        mutate(
            text =
                paste(name,
                    paste0("year: ", year),
                    paste0("# of male: ", n_M),
                    paste0("# of female: ", n_F),
                    paste0("% of male: ", scales::percent(prop_M)),
                    paste0("% of female: ", scales::percent(prop_F)),
                    sep = "\n"
                )
        ) %>%
        select(year, name, text) %>%
        spread(year, text) %>%
        column_to_rownames("name") %>%
        as.matrix()
    text_mat <- text_mat[rownames(ratio_mat), colnames(ratio_mat)]
    colors <- colorRampPalette(c("blue", "white", "red"))(1000)
    hc <- tgs_cor(t(ratio_mat)) %>%
        tgs_dist() %>%
        hclust(method = "ward.D2")
    hc <- as.hclust(reorder(
        as.dendrogram(hc),
        apply(ratio_mat, 1, which.max),
        agglo.FUN = mean
    ))
    n_names <- length(uni_names)
    plotly::plot_ly(z = ratio_mat[hc$order, ], y = rownames(ratio_mat)[hc$order], x = colnames(ratio_mat), type = "heatmap", colors = colors, hoverinfo = "text", text = text_mat[hc$order, ]) %>%
        plotly::layout(title = paste0(n_names, " unisex names from the ", sector, " sector"), yaxis = list(title = ""), xaxis = list(title = "Year"))
}

Run the function - red is more male names and blue is more female names:

cluster_unisex_names("Jewish")

cluster_unisex_names("Muslim")

cluster_unisex_names("Christian")

cluster_unisex_names("Druze")

Names that are growing in a short period of time

We can look at names that are growing in popularity in a short period of time, e.g. a single year.

growth_names <- babynamesIL %>%
    arrange(sector, sex, name, year) %>%
    filter(lead(n) >= 100) %>% # take only names with at least 100 babies
    group_by(sector, name, sex) %>%
    mutate(next_n = lead(n), growth = next_n / n) %>%
    ungroup() %>%
    filter(growth >= 2) %>%
    arrange(desc(growth))
head(growth_names)
#> # A tibble: 6 × 8
#>   sector  year sex   name       n     prop next_n growth
#>   <chr>  <dbl> <chr> <chr>  <int>    <dbl>  <int>  <dbl>
#> 1 Muslim  1974 M     וסאם      21 0.00286     329  15.7 
#> 2 Muslim  2008 F     גינא      21 0.00144     215  10.2 
#> 3 Jewish  2002 F     אגם       17 0.000390    172  10.1 
#> 4 Jewish  1980 F     סיון     100 0.00312     954   9.54
#> 5 Muslim  1986 F     רהאם      21 0.00284     200   9.52
#> 6 Jewish  1985 M     אליאור    24 0.000659    219   9.12

nrow(growth_names)
#> [1] 124

Plot:

growth_names %>%
    filter(sector == "Jewish") %>%
    rename(`Number of babies` = next_n) %>%
    ggplot(aes(x = year + 1, y = growth, size = `Number of babies`, label = name, color = sex)) +
    geom_point() +
    theme_classic() +
    scale_y_log10() +
    ggsci::scale_color_aaas() +
    ggrepel::geom_text_repel(size = 6) +
    scale_x_continuous(breaks = seq(1948, 2023, 5)) +
    xlab("Year") +
    ylab("Growth")
#> Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
#> increasing max.overlaps

Declining names

We can look for names that declined the most:

decline_names_overall <- babynamesIL %>%
    arrange(sector, sex, name, year) %>%
    group_by(sector, name, sex) %>%
    summarise(max_n = max(n), min_n = min(n), max_year = year[which.max(n)], min_year = year[which.min(n)], decline = 1 - (min_n / max_n)) %>%
    ungroup() %>%
    filter(max_n >= 100, max_year < min_year) %>%
    filter(decline >= 0.95)
#> `summarise()` has grouped output by 'sector', 'name'. You can override using
#> the `.groups` argument.

decline_names_overall %>%
    arrange(max_n)
#> # A tibble: 79 × 8
#>    sector name  sex   max_n min_n max_year min_year decline
#>    <chr>  <chr> <chr> <int> <int>    <dbl>    <dbl>   <dbl>
#>  1 Jewish עירית F       100     5     1971     1998   0.95 
#>  2 Jewish לירן  F       101     5     1984     2023   0.950
#>  3 Muslim היא   F       102     5     1993     2014   0.951
#>  4 Muslim ראניה F       102     5     1979     2020   0.951
#>  5 Jewish זיוה  F       103     5     1960     1989   0.951
#>  6 Jewish ורדה  F       105     5     1957     1980   0.952
#>  7 Jewish קובי  M       112     5     1981     2007   0.955
#>  8 Jewish עידית F       116     5     1973     1994   0.957
#>  9 Muslim סמאח  F       120     6     1976     2019   0.95 
#> 10 Jewish ירדנה F       121     5     1967     1983   0.959
#> # ℹ 69 more rows

Plot:

decline_names_overall %>%
    ggplot(aes(x = max_year, y = max_n, label = name, color = sex)) +
    geom_point() +
    theme_classic() +
    ggsci::scale_color_aaas() +
    ggrepel::geom_text_repel() +
    xlab("Year") +
    scale_y_log10() +
    ylab("Number of babies")