One of my data inputs looks like this:
O75663 O95456 O75663 O95456
O95400 O95670 O95400 O95670
O95433 O95433 O95801
O95456 P00352
O95670
df<- structure(
list(
V1 = structure(c(2L, 3L, 4L, 1L, 1L),
.Label = c("", "O75663", "O95400", "O95433"),
class = "factor"),
V2 = structure(c(2L, 3L, 1L, 1L, 1L),
.Label = c("", "O95456", "O95670"),
class = "factor"),
V3 = structure(1:5,
.Label = c("O75663", "O95400", "O95433", "O95456", "O95670"),
class = "factor"),
V4 = structure(c(2L, 3L, 4L, 5L, 1L),
.Label = c("", "O95456", "O95670", "O95801", "P00352"),
class = "factor")),
.Names = c("V1", "V2", "V3", "V4"),
class = "data.frame",
row.names = c(NA, -5L))
My second data input looks like this:
O75663
O95400
O95433
O95456
O95670
O95801
P00352
P00492
I want to know for each string from the second data set in which columns of the first data it can be found. It might be in none of them, or several. I want the output to look like the following:
strings column ids
O75663 1, 3
O95400 1, 3
O95433 1, 3
O95456 2, 3, 4
O95670 2, 3, 4
O95801 4
P00352 4
P00492 NA
The new strs
:
strs <- structure(
list(
strings = structure(c(2L, 3L, 4L, 5L, 6L, 7L, 1L, 1L),
.Label = c("", "O75663", "O95400", "O95433", "O95456", "O95670", "O95801"),
class = "factor"),
strings2 = structure(c(4L, 2L, 6L, 5L, 3L, 1L, 1L, 1L),
.Label = c("", "O75663", "O95433", "O95456", "P00352", "P00492"),
class = "factor"),
strings3 = structure(c(4L, 6L, 7L, 8L, 2L, 3L, 5L, 1L),
.Label = c("", "O75663", "O95400", "O95456", "O95670", "O95801", "P00352", "P00492"),
class = "factor"),
strings4 = structure(c(2L, 5L, 3L, 4L, 1L, 1L, 1L, 1L),
.Label = c("", "O95400", "O95456", "O95801", "P00492"),
class = "factor"),
strings5 = structure(c(8L, 2L, 7L, 1L, 3L, 6L, 5L, 4L),
.Label = c("O75663", "O95400", "O95433", "O95456", "O95670", "O95801", "P00352", "P00492"),
class = "factor")),
.Names = c("strings", "strings2", "strings3", "strings4", "strings5"),
class = "data.frame",
row.names = c(NA, -8L))
lut <- structure(
list(
V1 = c("O75663", "O95400", "O95433", NA, NA),
V2 = c("O95456", "O95670", NA, NA, NA),
V3 = c("O75663", "O95400", "O95433", "O95456", "O95670"),
V4 = c("O95456", "O95670", "O95801", "P00352", NA),
V1 = c("O75663", "O95400", "O95433", NA, NA),
V2 = c("O95456", "O95670", NA, NA, NA),
V3 = c("O75663", "O95400", "O95433", "O95456", "O95670"),
V4 = c("O95456", "O95670", "O95801", "P00352", NA)),
.Names = c("V1", "V2", "V3", "V4", "V1", "V2", "V3", "V4"),
row.names = c(NA, -5L), class = "data.frame")
df<- setDT(strs)[, paste0('colids_',seq_along(strs)) :=
lapply(.SD, function(x) toString(which(colSums(lut == x, na.rm=TRUE) > 0))),
by = 1:nrow(strs)][]
Then I get this error:
See Question&Answers more detail:osError in
df1 == x
: comparison of these types is not implemented
In addition: Warning message: Inis.data.frame(x)
:
Incompatible methods ("Ops.data.frame
", "Ops.factor
") for "==
"