The purpose of this script is to create a smaller subset of the full Epigraphic Database Heidelberg (EDH) dataset that can be further explored for the purposes of social network analysis (SNA) within the Past Social Network Project, Aarhus University. It also includes a basic exploration of useful attributes connected to people in inscriptions, such as gender, age, and social status.
WARNING! The following code was designed for the 2021 version. As some attributes in the 2022 version changed, you may have to alter the code below where necessary.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(tidyverse)
library(jsonlite)
library(stringi)
# getwd() # check your working directory
# Create the data folders used below. `recursive = TRUE` creates "../data" as
# well, and `showWarnings = FALSE` keeps re-runs quiet when they already exist.
dir.create("../data/large_data", recursive = TRUE, showWarnings = FALSE)
# download as a local copy from Zenodo
# version 2021 (81476)
# download.file("https://zenodo.org/record/4888168/files/EDH_text_cleaned_2021-01-21.json?download=1", "../data/large_data/EDH_text_cleaned_2021-01-21.json")
########### WARNING! In case you get an error message, you may have to manually download the dataset from Zenodo and save it to the 'large_data' folder. Don't forget to check that the dataset or the entire folder is listed in your .gitignore file, otherwise you risk accidentally committing it to GitHub and then having to remove the large file from your commit.
# version 2022 (81883)
# download.file("https://zenodo.org/record/7303886/files/EDH_text_cleaned_2022-11-03.json?download=1", "../data/large_data/EDH_text_cleaned_2022-11-03.json")
########### WARNING! The following code was designed for the 2021 version. As some attributes in the 2022 version changed, you may have to alter the code below where necessary.
# 2021
edh_json_path <- "../data/large_data/EDH_text_cleaned_2021-01-21.json"
# 2022
# edh_json_path <- "../data/large_data/EDH_text_cleaned_2022-11-03.json"

# Fail early with a clear message: without this check a missing file surfaces
# as a confusing JSON parse error, because the path string itself gets parsed.
if (!file.exists(edh_json_path)) {
  stop(
    "EDH dataset not found at '", edh_json_path,
    "'. Download it from Zenodo (see the links above) first.",
    call. = FALSE
  )
}
list_json <- jsonlite::fromJSON(edh_json_path)
EDH <- as_tibble(list_json)
# Keep only the attributes needed for the people-centred analysis, split the
# coordinate pair into two numeric columns and turn every `people` entry into
# a data frame so it can be unnested.
# NOTE(review): the first coordinate is labelled latitude here; confirm this
# matches the order used in the EDH JSON.
EDHs <- EDH %>%
  select(
    id,
    coordinates, findspot_ancient_clean, province_label_clean,
    modern_region_clean,
    not_before, not_after,
    type_of_monument_clean, material_clean,
    type_of_inscription_clean,
    transcription, clean_text_interpretive_word,
    people
  ) %>%
  separate(col = coordinates, into = c("latitude", "longitude"), sep = ",") %>%
  mutate(
    # strip the closing ")" left over from the coordinate vector
    longitude = as.numeric(
      str_replace(longitude, pattern = "\\)", replacement = "")
    ),
    # strip the leading "c(" left over from the coordinate vector
    latitude = as.numeric(
      str_replace(latitude, pattern = "c\\(", replacement = "")
    ),
    people = map(people, as.data.frame)
  )

# One row per person (inscriptions without people are kept as a single row),
# with the person attributes spread into their own columns.
EDH_people <- EDHs %>%
  unnest_longer(col = people, keep_empty = TRUE) %>%
  unnest_wider(people)
nrow(EDH_people)
## [1] 127822
# keep only the rows that describe a person, i.e. rows with a non-missing name
EDH_people <- filter(EDH_people, !is.na(name))
nrow(EDH_people)
## [1] 92399
head(EDH_people, n = 10)
# distribution of the per-inscription person number
summary(as.numeric(EDH_people$person_id))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 4.234 3.000 244.000
# number of people per gender value, with each value's share in percent
mutate(
  count(EDH_people, gender, sort = TRUE),
  ratio = round(n / (sum(n) / 100), 2)
)
# all distinct values of the status attribute
unique(EDH_people$status)
## [1] NA
## [2] "senatorial order"
## [3] "slaves"
## [4] "freedmen / freedwomen"
## [5] "freedmen / freedwomen?"
## [6] "slaves?"
## [7] "senatorial order?"
## [8] "decurial order, higher local offices"
## [9] "military personnel"
## [10] "equestrian order"
## [11] "decurial order, higher local offices?"
## [12] "equestrian order?"
## [13] "military personnel?"
## [14] "Augustales"
## [15] "emperor / imperial household?"
## [16] "Augustales; freedmen / freedwomen"
## [17] "equestrian order?; decurial order, higher local offices"
## [18] "decurial order, higher local offices; freedmen / freedwomen"
## [19] "equestrian order; decurial order, higher local offices"
## [20] "lower local offices, administration of imperial estates"
## [21] "equestrian order; freedmen / freedwomen"
## [22] "decurial order, higher local offices; military personnel"
## [23] "Augustales?"
## [24] "equestrian order; military personnel"
## [25] "rulers (foreign)"
## [26] "senatorial order; equestrian order"
## [27] "emperor / imperial household; equestrian order"
## [28] "decurial order, higher local offices; Augustales"
## [29] "lower local offices, administration of imperial estates; freedmen / freedwomen"
## [30] "senatorial order; decurial order, higher local offices"
## [31] "equestrian order?; decurial order, higher local offices?"
## [32] "Augustales; decurial order, higher local offices"
## [33] "decurial order, higher local offices; military personnel?"
## [34] "lower local offices, administration of imperial estates?"
## [35] "decurial order, higher local offices; equestrian order"
## [36] "decurial order, higher local offices; Augustales?"
## [37] "emperor / imperial household; decurial order, higher local offices"
## [38] "decurial order, higher local offices?; lower local offices, administration of imperial estates"
## [39] "freedmen / freedwomen; military personnel"
## [40] "equestrian order; decurial order, higher local offices; military personnel"
## [41] "decurial order, higher local offices; lower local offices, administration of imperial estates"
## [42] "lower local offices, administration of imperial estates; military personnel"
## [43] "decurial order, higher local offices?; military personnel"
## [44] "equestrian order?; military personnel?"
## [45] "lower local offices, administration of imperial estates; Augustales"
## [46] "equestrian order; decurial order, higher local offices?"
## [47] "senatorial order?; equestrian order?"
## [48] "decurial order, higher local offices?; freedmen / freedwomen"
## [49] "Augustales?; freedmen / freedwomen"
## [50] "equestrian order?; lower local offices, administration of imperial estates"
# Status values can hold several categories separated by ";".
# `status` keeps the first three parts side by side (columns V1, V2, V3).
status <- str_split_fixed(EDH_people$status, ";", n = 3) %>%
  as.data.frame()

# Count every individual status category. Splitting without a fixed limit
# means a record with more than three categories is still counted part by part.
status_counts <- data.frame(
  combined = unlist(str_split(EDH_people$status, ";"))
) %>%
  # drops empty parts and people without any status (NA)
  filter(combined != "") %>%
  mutate(
    # remove the uncertainty marker so "slaves?" is counted with "slaves"
    combined_clean = str_remove_all(combined, "\\?"),
    # remove the space that follows the ";" separator
    combined_clean = str_remove(combined_clean, "^ ")
  ) %>%
  count(combined_clean, sort = TRUE)
status_counts
# Horizontal bar chart of the status categories, most frequent at the top.
status_plot <- status_counts %>%
  mutate(combined_clean = reorder(combined_clean, n)) %>%
  ggplot(aes(y = combined_clean, x = n, fill = combined_clean)) +
  # geom_col() always uses the values as given, so it takes no `stat` argument
  geom_col(width = 0.8) +
  coord_cartesian(xlim = c(0, 10000)) +
  labs(
    x = "Number of instances",
    y = "Status category",
    title = "Overview of status references in the EDH dataset",
    # the subtitle must be a plain string, not a ggtitle() object
    subtitle = paste("n =", nrow(EDHs), "inscriptions")
  ) +
  geom_label(aes(label = n)) +
  theme_linedraw(base_size = 13) +
  theme(legend.position = "none")
status_plot

# Save the figure; the plot is passed explicitly so the files never depend on
# whichever plot happened to be drawn last.
dir.create("../figures", showWarnings = FALSE)
ggsave("../figures/Status_overview.jpg", plot = status_plot, width = 12, height = 8)
ggsave("../figures/Status_overview.png", plot = status_plot, width = 12, height = 8)
Before we can display the age of people, we need to streamline it: age is recorded as character data and spread over several attributes (age in years, months, days, and hours). We convert these to numeric values and save the result as the total age in years.
# Parse the age components (recorded as text) into numbers and combine them
# into one age in years.
EDH_people <- EDH_people %>%
  mutate(
    # take the first run of digits from each text attribute
    age_years = as.numeric(str_extract(`age: years`, pattern = "[:digit:]+")),
    age_months = as.numeric(str_extract(`age: months`, pattern = "[:digit:]+")),
    age_days = as.numeric(str_extract(`age: days`, pattern = "[:digit:]+")),
    age_hours = as.numeric(str_extract(`age: hours`, pattern = "[:digit:]+")),
    # A missing component counts as 0 so that people with only months, days
    # or hours recorded (e.g. infants) still get an age. The total is NA only
    # when no component is recorded at all.
    total_age = if_else(
      is.na(age_years) & is.na(age_months) & is.na(age_days) & is.na(age_hours),
      NA_real_,
      coalesce(age_years, 0) +
        coalesce(age_months, 0) / 12 +
        coalesce(age_days, 0) / 365 +
        coalesce(age_hours, 0) / (24 * 365)
    )
  ) %>%
  # the original text attributes are no longer needed
  dplyr::select(-starts_with("age: "))
summary(EDH_people$total_age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 17.00 30.00 32.76 46.00 200.00 84938
# percentage of people with a known total age
sum(!is.na(EDH_people$total_age)) / (nrow(EDH_people) / 100)
## [1] 8.074763
Before we can display the origin of people, we need to streamline the text and clean it from uncertainty symbols and other variations. That way, we can get as close to the original place name as possible.
Description from the EDH website:
Geographical information concerning the origin of a person named in the inscription
spelling guide:
usually as it is in the text of the inscription
(e.g. "Alba Pompeia", "natione Surus domo Hemesa", "ex Italia / τῶν ἐξ Ἰταλίας", "ex provincia Asia civitate Focia")
in the nominative only when adjectives are used (e.g. "Ulisitanus", "Emeritensis", "Ῥωμαῖος")
correct spelling without brackets
Designation of resolutions and supplements / erasures:
* = resolved abbreviation
+ = supplied
++ = erased, but still readable
+++ = erased and no longer readable
# Words that introduce an origin ("natione", "domo", "civis", ...) rather than
# naming the place itself.
origo_filler_words <- c(
  "gente", "genitus", "gentis", "natus", "civis", "Civis", "civus", "natione",
  "nato", "regione", "domo", "domu", "cives", "cive", "civi", "civitate",
  "ex", "verna"
)
# Match the filler words only as whole words; otherwise "ex" would also be
# cut out of the middle of place names such as "Alexandria".
origo_filler_pattern <- paste0(
  "\\b(?:", paste(origo_filler_words, collapse = "|"), ")\\b"
)

EDH_people <- EDH_people %>%
  mutate(
    origo_clean = origo %>%
      # drop the resolution / supplement / uncertainty symbols (and any "|")
      str_replace_all(pattern = "[*+?!|]", replacement = "") %>%
      # "X (= Y)" -> keep Y, the form given in the brackets
      str_replace_all(pattern = "(\\w+) \\(= (\\w+)\\)", replacement = "\\2") %>%
      str_replace_all(pattern = origo_filler_pattern, replacement = "") %>%
      # trim both ends and collapse the gaps left by the removed words
      str_squish()
  )

# frequency table of the cleaned origins, most common first
table(EDH_people$origo_clean) %>%
  sort(decreasing = TRUE) %>%
  as.data.frame()
Each individual person is identified on the basis of the ‘name’ attribute. Here is the attribute description as found at the EDH website:
Name (including filiation), Tribus and Origo of a person named in the inscription
spelling:
as found and preserved in the inscription
include supplements, but without resolution of abbreviations
Emperors: the nomenclature "Imperator Caesar ... Augustus" is only indicated in the database for the Julian / Claudian emperors; without epithets (Pius, Felix, Invictus) or victory names (Parthicus maximus etc.)
# Cleaning rules, each stored as c(<regular expression>, <replacement>).
restoration_interpretive <- c("[\\[*\\]]", "")
# "x=y" between two letters: keep the first letter only
substitution_edh_interpretive <- c("([α-ωΑ-Ωa-zA-Z])=([α-ωΑ-Ωa-zA-Z])", "\\1")
#substitution_interpretive <- c("[\\<*\\>]", "")
# runs of Arabic digits are removed
arabic_numerals <- c("[0-9]+", "")
# the erasure brackets 〚 and 〛 are removed, their content is kept
erasure_new_text <- c("[〚〛]", "")
#erasure_new_text_triple <- c("\\[\\[\\[|\\]\\]\\]", "")
#erasure_new_text_double <- c("\\[\\[|\\]\\]", "")

# Clean a character vector of names.
#
# Applies, in this order, the substitution rule, the erasure-bracket rule and
# the Arabic-numeral rule defined above. Square brackets are left untouched.
#
# epigraphic_dataset: character vector to clean.
# Returns a character vector of the same length as the input.
cleaning_interpretive_word_edh <- function(epigraphic_dataset) {
  rules_in_order <- list(
    substitution_edh_interpretive,
    erasure_new_text,
    arabic_numerals
  )
  apply_rule <- function(text, rule) {
    gsub(pattern = rule[1], replacement = rule[2], x = text, perl = TRUE)
  }
  Reduce(apply_rule, rules_in_order, epigraphic_dataset)
}
# add the cleaned version of the name as a new column
EDH_people$name_clean <- cleaning_interpretive_word_edh(EDH_people$name)
# Small demonstration of stripping erasure brackets from a name.
# Other inputs to try: "[[[P. Septimius Geta]]]", "[[Geta]]"
input_string <- "[[---]]"
# Remove triple brackets first, then double brackets from that result, so
# both notations are handled by the same two steps.
result <- gsub("\\[\\[\\[|\\]\\]\\]", "", input_string, perl = TRUE)
result <- gsub("\\[\\[|\\]\\]", "", result, perl = TRUE)
cat(result)
## ---
# Overview of every original name next to its cleaned form.
name_overview <- EDH_people %>%
  count(name_clean, name, sort = FALSE)
# View() opens a data viewer, so only call it in an interactive session;
# this keeps knitting and Rscript runs from depending on a viewer.
if (interactive()) {
  View(name_overview)
}
Some people have an empty ‘name’ attribute even though their records still contain relevant information. However, for the SNA they should be removed; otherwise they would create connections and relationships where there were none.
# placeholders that stand for an empty name: [---], [[---]], [[[---]]]
empty_name_string <- c("[---]", "[[---]]", "[[[---]]]")
# copy of the people table without the nameless people; use with caution
EDH_people_rn <- filter(EDH_people, !(name_clean %in% empty_name_string))
Description as found at the EDH website:
Nomen / Nomina of a person named in the inscription.
Here one can search for the occurrence of a nomen independent of case, spelling and state of preservation on the inscription bearer.
Spelling:
- Always in the nominative irrespective of the case in the inscription.
- correct spelling without brackets
- In the case of the emperors up to and including Nero Caesar is treated as a part of the name.
Resolutions of abbreviations and supplements / erasures are not indicated by round or square brackets ( ), [ ], [[ ]], [[[ ]]], but through:
* = resolved
+ = supplemented
++ = erased, but still readable
+++ = erased and no longer readable
# frequency of the raw nomen values
EDH_people %>%
  count(nomen, sort = TRUE)

# Strip the editorial markers from the nomen in a single pass:
# "+" (supplemented / erased), "*" (resolved), "!" and "?" (uncertain).
EDH_people <- EDH_people %>%
  mutate(nomen_clean = str_remove_all(nomen, "[+*!?]"))
library(stringi)
# Make sure the text columns are stored as UTF-8.
# A round trip through windows-1252 is deliberately avoided: iconv() returns
# NA for every string that contains a character outside that code page (for
# example Greek names), and a column left in windows-1252 would no longer
# match the encoding of the rest of the table when it is exported.
EDH_people$nomen_clean <- enc2utf8(EDH_people$nomen_clean)
EDH_people$name <- enc2utf8(EDH_people$name)
# frequency of the cleaned nomen values
EDH_people %>%
  count(nomen_clean, sort = TRUE)
# creating a unique ID number for people
# seq_len() also behaves correctly for an empty table, unlike 1:nrow()
EDH_people$id_people <- seq_len(nrow(EDH_people))
# renaming variable describing the number of people on one inscription
EDH_people <- rename(EDH_people, person_number_insc = person_id)
Reordering the attributes into a logical order
# Keep the final set of attributes in a logical order: inscription metadata,
# person identifiers, name parts, then personal attributes and age.
EDH_people <- EDH_people %>%
  select(
    id, latitude, longitude, findspot_ancient_clean, province_label_clean,
    modern_region_clean, not_before, not_after, type_of_monument_clean,
    material_clean, type_of_inscription_clean, transcription,
    clean_text_interpretive_word, id_people, person_number_insc, name,
    praenomen, nomen, name_clean, nomen_clean, cognomen, supernomen, gender,
    origo, status, total_age,
    # all four age components (age_days used to be missing from this list)
    age_years, age_months, age_days, age_hours
  )

# Export as a comma-separated file. write.csv() always writes commas and
# ignores a `sep` argument with a warning, so none is passed.
dir.create("../data", showWarnings = FALSE)
write.csv(EDH_people, "../data/EDH_people_2021.csv", row.names = TRUE)
Aarhus University, Denmark, https://orcid.org/0000-0002-6349-0540↩︎