Product Variety and Concentration

14/06/2021

Data sources

Examples

AcousticBrainz

nrow(abrainz)
## [1] 502725
sample_n(abrainz, 5)
##                               recording                             relgroup
## 1: b779a4a6-d38a-4ade-9b25-d26ed7be5db0 46424940-f85a-43d7-8b79-63fe88191492
## 2: 393ebe3b-3bb8-4aee-80c1-6f4aa73d1149 94e679b3-f313-41c0-ac7b-6b7a2a08f5c3
## 3: 00247995-636f-402f-98a8-6d320a776741 e039c64b-d782-479e-bc64-c826deb8ade5
## 4: 8de59146-733a-4ebd-8bb1-f4246a8fa593 f7b59bee-62d7-4bc8-ad47-cda988cd8ded
## 5: 48ac29c0-4006-499e-b26b-04e19708bfbc 3b2e5548-9ce3-4b3d-852f-15b5defe4eb1
##    year key_key key_scale key_strength chords_key chords_scale danceability
## 1: 2010      G#     major       0.6834         G#        major       1.3306
## 2: 2009       E     minor       0.7729          G        major       1.0043
## 3: 2019       G     major       0.5575          E        minor       1.2043
## 4: 2013       D     minor       0.6840          D        minor       0.9911
## 5: 2008       F     minor       0.7153          F        minor       1.7653
##         bpm beats_count   length
## 1: 134.9412        1027 447.9216
## 2: 103.9364         400 220.9437
## 3: 119.9266         430 216.0000
## 4: 140.0551         635 282.4620
## 5: 127.9201         916 429.5600

ListenBrainz

nrow(lbrainz)
## [1] 235508172
sample_n(lbrainz, 5)
##              user                            recording      stamp year
## 1: Unfoldingdrama 08576508-6757-4084-8fe9-c523fc3ef420 1274757710 2010
## 2:        vinnidp d410420d-ac85-4c8a-8895-7bf99fa50751 1260481106 2009
## 3:       Clunkb0t 053e6fc0-aa42-4ee9-a8b0-2d8978dc71ed 1493758790 2017
## 4:        Gitrian 070a99c8-75e1-4566-8ba0-bf5799181bce 1363693320 2013
## 5: MousetrapPling 398d93ad-685b-3d4b-b2a7-848b54e14b15 1196952798 2007

Discogs

nrow(discogs)
## [1] 75285
sample_n(discogs, 5)
##                                relgroup year  master   match           genre
## 1: 4beb905f-1c51-4d6f-a1a3-3d34a8db3db0 2010  325474  325474        Rock;Pop
## 2: 4b47d73f-5437-4816-8958-3017938fdc27 2011  393599  393599        Rock;Pop
## 3: 3e18991d-18e0-4268-a41f-99f6d031c63c 2012 1124311 1124311 Electronic;Rock
## 4: 3aa65db9-a211-42be-93d2-4a68db5fc54b 2020 1894587 1894587            Rock
## 5: e11fd9e9-7cd5-444b-bf5c-f5185f1f8ff8 2007  321623  321623         Hip Hop
##                           style
## 1:             Alternative Rock
## 2:                       Ballad
## 3: Indie Rock;Synth-pop;Electro
## 4:             Classic Rock;AOR
## 5:                         <NA>

Data preparation

# Number of parallel workers: leave one core free for the session/OS.
K <- detectCores() - 1L

# Central registry of data locations: local dump directories/caches and
# the remote endpoints they mirror.
path <- list(
  lbdump          = "../shared/listenbrainz/listenbrainz-listens-dump-284-20201201-180002-full/listens",
  mbdump          = "../shared/musicbrainz/mbdump",
  dcdump          = "../shared/discogs/discogs_20210501_",
  relgroup_local  = "../shared/musicbrainz/relgroup",
  relgroup_remote = "https://musicbrainz.org/release-group",
  abapi_local     = "../shared/acousticbrainz/abapi",
  abapi_remote    = "https://acousticbrainz.org/api/v1/low-level?recording_ids"
)

ListenBrainz

# Parse one year of ListenBrainz dump files into a data.table with one
# row per listen: (user, recording, stamp, year). Rows without a
# recording MBID are dropped; the result is ordered by timestamp.
#
# x: the year, which is also the name of the dump sub-directory to read.
# Fix: use TRUE rather than the reassignable shorthand T.
read_lbdump <- function (x) {
  files <- paste(path$lbdump, x, sep = "/") %>%
    list.files(full.names = TRUE)
  registerDoParallel(K)
  dump <-
    foreach(i = files, .combine = "rbind") %dopar% {
      # Each line is one JSON listen record; fields are pulled out with
      # regexes rather than a full JSON parse (faster on huge dumps).
      raw <- readLines(i)
      data.table(
        user = str_extract(raw, '\"user_name\":[^,]+') %>%
          str_replace_all('\"user_name\":|\"', ""),
        recording = str_extract(raw, '\"recording_mbid\":[^,]+') %>%
          str_extract(
            "[[:alnum:]]{8}-[[:alnum:]]{4}-[[:alnum:]]{4}-[[:alnum:]]{4}-[[:alnum:]]{12}"
          ),
        stamp = str_extract(raw, '\"timestamp\":[^,]+') %>%
          str_extract("[[:digit:]]+") %>% as.integer(),
        year = x
      )[!is.na(recording)]
    }
  stopImplicitCluster()
  dump[order(stamp)]
}
# Read all years and bind them in one pass. The previous version grew
# the table with rbind() inside a loop, copying the accumulated data on
# every iteration (quadratic); rbindlist() binds the list once.
lbrainz <- rbindlist(lapply(2006:2020, read_lbdump))

MusicBrainz

# Read one tab-separated MusicBrainz dump table (no header row, no
# quoting) and return it as a data.table with default V1..Vn names.
# Fix: use TRUE/FALSE rather than the reassignable shorthands T/F.
read_mbdump <- function (x) {
  paste(path$mbdump, x, sep = "/") %>%
    read.csv(
      sep = "\t",
      quote = "",
      as.is = TRUE,
      header = FALSE
    ) %>%
    data.table()
}
# Load the MusicBrainz dump tables and keep only the columns needed to
# chain recordings -> tracks -> media -> releases -> release groups,
# plus release dates and countries. The positional V1..Vn names come
# from the header-less dump files (assumed: V1 = internal id, V2 =
# MBID/name — TODO confirm against the mbdump schema).
mbrainz <- list()
mbrainz$recording <- read_mbdump("recording") %>%
  select(recording_id = V1, recording = V2)
mbrainz$track <- read_mbdump("track") %>%
  select(
    track_id = V1,
    track = V2,
    recording_id = V3,
    medium_id = V4
  )
# medium links tracks to the release they appear on.
mbrainz$medium <- read_mbdump("medium") %>%
  select(medium_id = V1, release_id = V2)
mbrainz$release <- read_mbdump("release") %>%
  select(release_id = V1,
         release = V2,
         relgroup_id = V5)
mbrainz$relgroup <- read_mbdump("release_group") %>%
  select(relgroup_id = V1, relgroup = V2)
# release_country carries the per-country release date parts.
mbrainz$relcountry <- read_mbdump("release_country") %>%
  select(
    release_id = V1,
    country_id = V2,
    year = V3,
    month = V4,
    day = V5
  )
# "area" contains countries among other area types; V3 assumed to be
# the area name — TODO confirm.
mbrainz$country <- read_mbdump("area") %>%
  select(country_id = V1, country = V3)
# Flatten the MusicBrainz schema into one recording-level table:
# track -> recording, track -> medium -> release -> release group,
# plus the release's date parts and country. Note this overwrites the
# list `mbrainz` with the joined table.
mbrainz <-
  left_join(mbrainz$track, mbrainz$recording, by = "recording_id") %>%
  left_join(mbrainz$medium, by = "medium_id") %>%
  left_join(mbrainz$release, by = "release_id") %>%
  left_join(mbrainz$relgroup, by = "relgroup_id") %>%
  left_join(mbrainz$relcountry, by = "release_id") %>%
  left_join(mbrainz$country, by = "country_id") %>%
  select(recording, release, relgroup, year, month, day, country)
# The dumps encode missing values as the literal string "\N". Convert
# the date parts to integers with "\N" mapped to NA — one loop instead
# of three copy-pasted statements.
for (col in c("year", "month", "day")) {
  mbrainz[[col]] <-
    as.integer(ifelse(mbrainz[[col]] == "\\N", NA, mbrainz[[col]]))
}

Matching Discogs

# One row per release group with its earliest release year, restricted
# to release groups first issued after 2005.
discogs <-
  data.table(mbrainz, key = "relgroup")[, list(year = min(year)), by = relgroup]
discogs <- discogs[year > 2005]
# Release-group pages are cached on disk, sharded by the first two
# characters of the MBID; create any shard directory that is missing.
prefixes <- unique(substr(discogs$relgroup, 1, 2))
for (prefix in prefixes) {
  shard <- paste(path$relgroup_local, prefix, sep = "/")
  if (!dir.exists(shard)) {
    dir.create(shard)
  }
}
# Ensure a local cached copy of the MusicBrainz release-group page for
# MBID `x` exists. Returns a status string: "local" (cache hit),
# "remote" (freshly downloaded) or "missing" (not available upstream).
# Fix: scalar `if` condition uses `||` (short-circuit) instead of the
# vectorised `|`, so file.size() is only consulted when the file exists
# (file.size() returns NA for a missing path); T -> TRUE.
fetch_relgroup <- function (x) {
  local <-
    paste(path$relgroup_local, "/", substr(x, 1, 2), "/", x, ".html", sep = "")
  remote <- paste(path$relgroup_remote, x, sep = "/")
  if (!file.exists(local) || file.size(local) == 0) {
    if (url.exists(remote)) {
      download.file(remote, local, method = "auto", quiet = TRUE)
      "remote"
    } else {
      "missing"
    }
  } else {
    "local"
  }
}
# Fetch every release-group page in parallel and tabulate how many were
# served from cache, downloaded, or missing upstream.
table(mcmapply(fetch_relgroup, discogs$relgroup, mc.cores = K))
# Extract the Discogs master id(s) cross-referenced on a cached
# release-group page. Returns a comma-separated string of master ids,
# or NA when the page is missing or carries no Discogs link. Rewritten
# with guard clauses instead of nested if/else.
read_relgroup <- function (x) {
  local <-
    paste(path$relgroup_local, "/", substr(x, 1, 2), "/", x, ".html", sep = "")
  if (!file.exists(local)) {
    return(NA)
  }
  anchors <- read_html(local) %>%
    xml_find_all(".//li") %>%
    xml_find_all(".//a")
  xref <- as.character(anchors[grepl("/master/", anchors)])
  if (length(xref) == 0) {
    return(NA)
  }
  str_extract(xref, "master/[^\"]+") %>%
    str_remove("master/") %>%
    paste(collapse = ",")
}
# HTML parsing is memory-hungry, hence only half the cores; keep only
# release groups with at least one Discogs cross-reference.
discogs$master <-
  mcmapply(read_relgroup, discogs$relgroup, mc.cores = round(K / 2))
discogs <- discogs[!is.na(master)]
# The first listed master id is used for matching against the dump.
discogs$match <- str_extract(discogs$master, "^[[:digit:]]+") %>%
  as.integer()

Discogs

# Read one Discogs XML dump ("masters", ...) and return its top-level
# child nodes.
read_dcdump <- function (x) {
  paste(path$dcdump, x, ".xml", sep = "") %>%
    read_xml() %>%
    xml_children()
}
# Keep only the <master> nodes whose id was matched above, then store
# each node's raw XML next to its integer id for lookup by id.
master <- read_dcdump("masters")
master <- as.character(
  master[as.integer(unlist(xml_attrs(master))) %in% discogs$match]
)
master <- data.table(
  id = str_extract(master, "<master id=\"[:digit:]+\">") %>%
    str_extract("[:digit:]+") %>% as.integer(),
  xml = master
)
# Shared extractor for a tag ("genres" or "styles") of a matched
# Discogs master — the previous match_genre/match_style were identical
# except for the tag name. Returns the tag's values joined by ";"
# (with &amp; unescaped), or NA when the master or tag is absent.
# The final str_remove_all strips the "c(...)" wrapper that paste()
# produces around the list returned by str_extract_all().
match_field <- function (x, tag) {
  xml <- master[id == x]$xml
  if (length(xml) == 0) {
    return(NA)
  }
  pattern <- regex(paste("<", tag, ">.*</", tag, ">", sep = ""),
                   dotall = TRUE)
  if (!str_detect(xml, pattern)) {
    return(NA)
  }
  str_extract(xml, pattern) %>%
    str_extract_all(">.*<") %>% paste(sep = "") %>%
    str_replace_all("&amp;", "&") %>%
    str_replace_all("\", ", ";") %>%
    str_remove_all("[><()\"]|^c")
}
# Genres of the master matched to x (";"-separated, or NA).
match_genre <- function (x) {
  match_field(x, "genres")
}
discogs$genre <-
  mcmapply(match_genre, discogs$match, mc.cores = K)
# Styles of the master matched to x (";"-separated, or NA).
match_style <- function (x) {
  match_field(x, "styles")
}
discogs$style <-
  mcmapply(match_style, discogs$match, mc.cores = K)

AcousticBrainz

# Recordings belonging to a Discogs-matched release group, each with its
# release group and earliest release year.
abrainz <- data.table(mbrainz[relgroup %in% discogs$relgroup],
                      key = "recording")[, list(relgroup, year = min(year)), by = recording] %>%
  distinct()
# The AcousticBrainz bulk endpoint accepts at most 25 MBIDs per request,
# so split the recordings into batches of 25.
mbids <-
  split(abrainz$recording, ceiling(seq_along(abrainz$recording) / 25))
# The local JSON cache is sharded by the first two MBID characters;
# create any shard directory that is missing.
for (prefix in unique(substr(abrainz$recording, 1, 2))) {
  shard <- paste(path$abapi_local, prefix, sep = "/")
  if (!dir.exists(shard)) {
    dir.create(shard)
  }
}
# Download AcousticBrainz low-level features for the x-th batch of (at
# most 25) MBIDs in one bulk API call and cache each recording's JSON
# under its two-character shard directory. Returns the batch index.
fetch_abapi_bulk <- function (x) {
  remote <-
    paste(path$abapi_remote, paste(mbids[[x]], collapse = ":0;"), sep = "=") %>%
    read_json()
  for (i in mbids[[x]]) {
    # The API silently omits recordings it has no data for.
    if (i %in% names(remote)) {
      local <-
        paste(path$abapi_local,
              "/",
              substr(i, 1, 2),
              "/",
              i,
              ".json",
              sep = "")
      # `||` short-circuits; file.size() is NA for missing files.
      if (!file.exists(local) || file.size(local) == 0) {
        write_json(remote[[i]]$`0`, local)
      }
    }
  }
  gc()
  x
}
# Bug fix: this previously called fetch_abapi() (the per-recording
# fallback defined below) with batch indices instead of recording MBIDs;
# the bulk fetcher is the function that takes an index into `mbids`.
# Also seq_along() instead of 1:length() (safe when mbids is empty).
mcmapply(fetch_abapi_bulk, seq_along(mbids), mc.cores = round(K / 2))
# Fetch (or reuse) the cached AcousticBrainz low-level JSON for one
# recording MBID. Returns "local" (cache hit), "remote" (downloaded) or
# "missing" (the API has no data for this recording).
# Fix: scalar `if` condition uses `||` instead of vectorised `|`, so
# file.size() is only read for existing files; T -> TRUE elsewhere.
fetch_abapi <- function (x) {
  local <-
    paste(path$abapi_local, "/", substr(x, 1, 2), "/", x, ".json", sep = "")
  if (!file.exists(local) || file.size(local) == 0) {
    Sys.sleep(1)  # rate-limit individual API requests
    remote <- paste(path$abapi_remote, "=", x, ":0", sep = "") %>%
      read_json()
    if (!is.null(remote[[x]])) {
      write_json(remote[[x]]$`0`, local)
      "remote"
    } else {
      "missing"
    }
  } else {
    "local"
  }
}
# Fill per-recording gaps one MBID at a time and summarise cache status.
table(mcmapply(fetch_abapi, abrainz$recording, mc.cores = round(K / 2)))
# Keep only recordings whose feature JSON is actually cached on disk.
# paste0(), substr() and file.exists() are all vectorised, so the
# previous parallel per-element mcmapply() was pure overhead.
abrainz <- abrainz[file.exists(paste0(
  path$abapi_local, "/",
  substr(abrainz$recording, 1, 2), "/",
  abrainz$recording, ".json"
))]
# Pull one named feature out of a parsed AcousticBrainz low-level JSON
# document. Key/scale features live under $tonal, "length" under
# $metadata$audio_properties, everything else under $rhythm.
# Returns NA when the feature is absent.
# Fix: plain if/else instead of ifelse() — ifelse() is vectorised,
# evaluates both branches and truncates a multi-element value to its
# first element.
read_abfeat <- function (x, y) {
  if (grepl("key|scale", y)) {
    x <- x$tonal
  } else if (y == "length") {
    x <- x$metadata$audio_properties
  } else {
    x <- x$rhythm
  }
  if (y %in% names(x)) x[[y]] else NA
}
registerDoParallel(K)
# Parse each cached JSON and assemble one row of audio features per
# recording. Bug fix: read_json() previously received a stray `local`
# second argument (the variable being assigned, hence not yet bound in
# this scope), which was forwarded through `...`; the parsed document is
# now held in a clearly named variable instead of reusing the path name.
abfeats <-
  foreach(i = abrainz$recording, .combine = "rbind") %dopar% {
    doc <-
      paste(path$abapi_local, "/", substr(i, 1, 2), "/", i, ".json", sep = "") %>%
      read_json(simplifyVector = TRUE)
    data.table(
      key_key = read_abfeat(doc, "key_key"),
      key_scale = read_abfeat(doc, "key_scale"),
      key_strength = read_abfeat(doc, "key_strength"),
      chords_key = read_abfeat(doc, "chords_key"),
      chords_scale = read_abfeat(doc, "chords_scale"),
      danceability = read_abfeat(doc, "danceability"),
      bpm = read_abfeat(doc, "bpm"),
      beats_count = read_abfeat(doc, "beats_count"),
      length = read_abfeat(doc, "length")
    )
  }
stopImplicitCluster()
# Rows of abfeats line up with abrainz$recording by construction.
abrainz <- cbind(abrainz, abfeats)

Sample and variables

# One row per (style niche, year) combination over the observed listen
# years; niches are the distinct tags of the ";"-separated Discogs
# style strings.
nicheyr <-
  expand.grid(
    niche = str_split(discogs$style, ";") %>% unlist() %>%
      na.omit() %>% unique() %>% sort(),
    year = min(lbrainz$year):max(lbrainz$year)
  ) %>% data.table()
# Restrict listens to recordings that have cached audio features.
lbrainz <- lbrainz[recording %in% abrainz$recording]
# One-hot encode the categorical audio features, dropping the first
# level of each and the source columns.
# NOTE(review): the final select() reorders columns by position and
# assumes the dummy expansion yields exactly columns 9:32 — fragile if
# the factor levels present in the data ever change; verify on re-run.
abrainz <- dummy_cols(
  abrainz[complete.cases(abrainz)],
  c("key_key", "key_scale", "chords_key", "chords_scale"),
  remove_first_dummy = T,
  remove_selected_columns = T
) %>% select(c(1:3, 9:32, 4:8))

Product variety

# Number of distinct recordings released in niche `x` during year `y`.
# Styles are ";"-separated, so the niche must be followed by ";" or the
# end of the string to avoid prefix matches.
count_prod <- function (x, y) {
  pattern <- paste(x, c(";", "$"), sep = "", collapse = "|")
  relgr <- discogs[grepl(pattern, style) & year == y]$relgroup
  length(unique(abrainz[relgroup %in% relgr]$recording))
}
nicheyr$noprod <-
  mcmapply(count_prod, nicheyr$niche, nicheyr$year, mc.cores = K)

Within-niche concentration

# Within-niche concentration: concentration of year-y listen counts
# across the recordings of niche `x`, computed with the external conc()
# helper (defined elsewhere — presumably a concentration index such as
# HHI; confirm against its definition). NA when the niche received no
# listens that year.
# Fix: the year filter was applied twice (`lis[year == y]` in both the
# guard and the body); subset once instead.
within_conc <- function (x, y) {
  relgr <-
    discogs[grepl(paste(x, c(";", "$"), sep = "", collapse = "|"), style) &
              year == y]$relgroup
  rec <- abrainz[relgroup %in% relgr]$recording
  lis <- lbrainz[recording %in% rec & year == y]
  if (nrow(lis) > 0) {
    conc(table(lis$recording))
  } else {
    NA
  }
}
nicheyr$wconc <-
  mcmapply(within_conc, nicheyr$niche, nicheyr$year, mc.cores = K)

Across-niche concentration

# Across-niche concentration for year `y`: spread each year-y listen
# over the ";"-separated style tags of its release group, then compute
# concentration (external conc() helper) over the style distribution.
across_conc <- function (y) {
  styles_of_relgroup <- discogs[year == y, c("relgroup", "style")]
  recs <- abrainz[relgroup %in% styles_of_relgroup$relgroup,
                  c("recording", "relgroup")]
  listens <- lbrainz[recording %in% recs$recording][year == y]
  tagged <-
    left_join(listens, recs, by = "recording")[, c("recording", "relgroup")] %>%
    left_join(styles_of_relgroup, by = "relgroup")
  conc(table(unlist(str_split(tagged$style, ";"))))
}
registerDoParallel(K)
# Compute the across-niche concentration once per year and join it back
# onto every niche-year row (right_join keeps all nicheyr rows).
nicheyr <- foreach(i = min(nicheyr$year):max(nicheyr$year),
        .combine = "rbind") %dopar% {
          data.table(year = i, xconc = across_conc(i))
        } %>% right_join(nicheyr, by = "year")
stopImplicitCluster()
# Reorder columns by position from (year, xconc, niche, noprod, wconc)
# to (niche, year, noprod, wconc, xconc).
# NOTE(review): positional indexing silently breaks if the join output
# order changes — prefer named selection if this is revisited.
nicheyr <- nicheyr[, c(3, 1, 4:5, 2)]

Correlations

cor.test(nicheyr$noprod, nicheyr$wconc)
## 
##  Pearson's product-moment correlation
## 
## data:  nicheyr$noprod and nicheyr$wconc
## t = -16.868, df = 3334, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3113788 -0.2488373
## sample estimates:
##        cor 
## -0.2804056
cor.test(nicheyr$noprod, nicheyr$xconc)
## 
##  Pearson's product-moment correlation
## 
## data:  nicheyr$noprod and nicheyr$xconc
## t = -2.9869, df = 7333, p-value = 0.002828
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.05769848 -0.01198300
## sample estimates:
##         cor 
## -0.03485898
cor.test(nicheyr$wconc, nicheyr$xconc)
## 
##  Pearson's product-moment correlation
## 
## data:  nicheyr$wconc and nicheyr$xconc
## t = 6.4824, df = 3334, p-value = 1.036e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.07792568 0.14495429
## sample estimates:
##       cor 
## 0.1115669
# Side-by-side scatterplots of niche-year product counts against the
# two concentration measures.
scatter_wconc <- ggplot(nicheyr[!is.na(wconc)]) +
  geom_point(aes(noprod, wconc), alpha = .3) +
  labs(x = "No. products in niche-year", y = "Within-niche concentration")
scatter_xconc <- ggplot(nicheyr) +
  geom_point(aes(noprod, xconc), alpha = .3) +
  labs(x = "No. products in niche-year", y = "Across-niche concentration")
plot_grid(scatter_wconc, scatter_xconc)