In order to get the data, you need the tableau URL which is in this case :
https://public.tableau.com/views/S07StuP58/Dashboard1?:embed=y&:showVizHome=no
The flow is the following :
call the following url :
GET https://public.tableau.com/views/S07StuP58/Dashboard1?:embed=y&:showVizHome=no
extract the JSON content from the textarea
with id tsConfigContainer
build the url with the session_id
POST https://public.tableau.com/{vizql_path}/bootstrapSession/sessions/{session_id}
extract the JSON data from the response which is not JSON originally (regex to split the data)
extract the data from the large JSON configuration, this is not straightforward since all the strings data are located in a single array. You need to get the data indices from various fields in order to be able to split the data into columns and then build your dataframe
The following code will prompt the user to choose a worksheet (by index), parse the data and put it in a dataframe :
library(rvest)
library(rjson)
library(httr)
library(stringr)
#replace the hostname and the path if necessary
host_url <- "https://public.tableau.com"
path <- "/views/S07StuP58/Dashboard1"
body <- read_html(modify_url(host_url,
path = path,
query = list(":embed" = "y",":showVizHome" = "no")
))
data <- body %>%
html_nodes("textarea#tsConfigContainer") %>%
html_text()
json <- fromJSON(data)
url <- modify_url(host_url, path = paste(json$vizql_root, "/bootstrapSession/sessions/", json$sessionid, sep =""))
resp <- POST(url, body = list(sheet_id = json$sheetId), encode = "form")
data <- content(resp, "text")
extract <- str_match(data, "\d+;(\{.*\})\d+;(\{.*\})")
info <- fromJSON(extract[1,1])
data <- fromJSON(extract[1,3])
worksheets = names(data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap)
for(i in 1:length(worksheets)){
print(paste("[",i,"] ",worksheets[i], sep=""))
}
selected <- readline(prompt="select worksheet by index: ");
worksheet <- worksheets[as.integer(selected)]
print(paste("you selected :", worksheet, sep=" "))
columnsData <- data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap[[worksheet]]$presModelHolder$genVizDataPresModel$paneColumnsData
i <- 1
result <- list();
for(t in columnsData$vizDataColumns){
if (is.null(t[["fieldCaption"]]) == FALSE) {
paneIndex <- t$paneIndices
columnIndex <- t$columnIndices
if (length(t$paneIndices) > 1){
paneIndex <- t$paneIndices[1]
}
if (length(t$columnIndices) > 1){
columnIndex <- t$columnIndices[1]
}
result[[i]] <- list(
fieldCaption = t[["fieldCaption"]],
valueIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$valueIndices,
aliasIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$aliasIndices,
dataType = t[["dataType"]],
stringsAsFactors = FALSE
)
i <- i + 1
}
}
dataFull = data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments[["0"]]$dataColumns
cstring <- list();
for(t in dataFull) {
if(t$dataType == "cstring"){
cstring <- t
break
}
}
data_index <- 1
name_index <- 1
frameData <- list()
frameNames <- c()
for(t in dataFull) {
for(index in result) {
if (t$dataType == index["dataType"]){
if (length(index$valueIndices) > 0) {
j <- 1
vector <- character(length(index$valueIndices))
for (it in index$valueIndices){
vector[j] <- t$dataValues[it+1]
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$fieldCaption, "value", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
if (length(index$aliasIndices) > 0) {
j <- 1
vector <- character(length(index$aliasIndices))
for (it in index$aliasIndices){
if (it >= 0){
vector[j] <- t$dataValues[it+1]
} else {
vector[j] <- cstring$dataValues[abs(it)]
}
j <- j + 1
}
frameData[[data_index]] <- vector
frameNames[[name_index]] <- paste(index$fieldCaption, "alias", sep="-")
data_index <- data_index + 1
name_index <- name_index + 1
}
}
}
}
df <- NULL
lengthList <- c()
for(i in 1:length(frameNames)){
lengthList[i] <- length(frameData[[i]])
}
max <- max(lengthList)
for(i in 1:length(frameNames)){
if (length(frameData[[i]]) < max){
len <- length(frameData[[i]])
frameData[[i]][(len+1):max]<-""
}
df[frameNames[[i]]] <- frameData[i]
}
options(width = 1200)
df <- as.data.frame(df, stringsAsFactors = FALSE)
print(df)
which gives the following output :
X.Student.Aid.Program.Type..value X.Student.Aid.Program..value X..Measure.Names..alias X.Multiple.Values..alias
1 Grants/Scholarships Subtotal Graduate Amount Awarded $XXXXXXXX
2 Grants/Scholarships Other (External) Grants/Scholarships Graduate Amount Awarded $XXXXXX
3 Grants/Scholarships Miscellaneous Tuition/Fee Payments/Waivers Graduate Amount Awarded $XXXXXX
4 Grants/Scholarships Graduate Assistantship Tuition Waivers Graduate Amount Awarded $XXXXXX
5 Grants/Scholarships Athletic Scholarships Graduate Amount Awarded $XXXXXX
6 Grants/Scholarships Institutional Grants/Scholarships Graduate Amount Awarded $XXXXXX
7 Grants/Scholarships State (Other) Grants/Scholarships Graduate Amount Awarded $XXXXXX
8 Grants/Scholarships Zell Miller Scholarships Graduate Amount Awarded $XXXXXX
9 Grants/Scholarships HOPE Scholarships Graduate Amount Awarded $XXXXXX
10 Grants/Scholarships Federal (Other) Grants/Scholarships Graduate Amount Awarded
11 Grants/Scholarships Federal Supplemental Grants Graduate Amount Awarded
12 Grants/Scholarships Federal Pell Grants Graduate Amount Awarded
13 Grants/Scholarships Subtotal Graduate Number of Awards XXXXXX
14 Grants/Scholarships Other (External) Grants/Scholarships Graduate Number of Awards XXX
The python equivalent of the code above would be :
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd
#replace the hostname and the path if necessary
host_url = "https://public.tableau.com"
path = "/views/S07StuP58/Dashboard1"
url = f"{host_url}{path}"
r = requests.get(
url,
params= {
":embed": "y",
":showVizHome": "no"
}
)
soup = BeautifulSoup(r.text, "html.parser")
tableauData = json.loads(soup.find("textarea",{"id": "tsConfigContainer"}).text)
dataUrl = f'{host_url}{tableauData["vizql_root"]}/bootstrapSession/sessions/{tableauData["sessionid"]}'
r = requests.post(dataUrl, data= {
"sheet_id": tableauData["sheetId"],
})
dataReg = re.search('d+;({.*})d+;({.*})', r.text, re.MULTILINE)
info = json.loads(dataReg.group(1))
data = json.loads(dataReg.group(2))
worksheets = list(data["secondaryInfo"]["presModelMap"]["vizData"]["presModelHolder"]["genPresModelMapPresModel"]["presModelMap"].keys())
for idx, ws in enumerate(worksheets):
print(f"[{idx}] {ws}")
selected = input("select worksheet by index: ")
worksheet = worksheets[int(selected)]
print(f"you selected : {worksheet}")
columnsData = data["secondaryInfo"]["presModelMap"]["vizData"]["presModelHolder"]["genPresModelMapPresModel"]["presModelMap"][worksheet]["presModelHolder"]["genVizDataPresModel"]["paneColumnsData"]
result = [
{
"fieldCaption": t.get("fieldCaption", ""),
"valueIndices": columnsData["paneColumnsList"][t["paneIndices"][0]]["vizPaneColumns"][t["columnIndices"][0]]["valueIndices"],
"aliasIndices": columnsData["paneColumnsList"][t["paneIndices"][0]]["vizPaneColumns"][t["columnIndices"][0]]["aliasIndices"],
"dataType": t.get("dataType"),
"paneIndices": t["paneIndices"][0],
"columnIndices": t["columnIndices"][0]
}
for t in columnsData["vizDataColumns"]
if t.get("fieldCaption")
]
dataFull = data["secondaryInfo"]["presModelMap"]["dataDictionary"]["presModelHolder"]["genDataDictionaryPresModel"]["dataSegments"]["0"]["dataColumns"]
def onAlias(it, value, cstring):
return value[it] if (it >= 0) else cstring["dataValues"][abs(it)-1]
frameData = {}
cstring = [t for t in dataFull if t["dataType"] == "cstring"][0]
for t in dataFull:
for index in result:
if (t["dataType"] == index["dataType"]):
if len(index["valueIndices"]) > 0:
frameData[f'{index["fieldCaption"]}-value'] = [t["dataValues"][abs(it)] for it in index["valueIndices"]]
if len(index["aliasIndices"]) > 0:
frameData[f'{index["fieldCaption"]}-alias'] = [onAlias(it, t["dataValues"], cstring) for it in index["aliasIndices"]]
df = pd.DataFrame.from_dict(frameData, orient='index').fillna(0).T
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
print(df)
<a href="https: