r - Finding the index based on two data frames of strings

Question

Ask a Question

Welcome To Ask or Share your Answers For Others

r - Finding the index based on two data frames of strings

asked Jan 31, 2022 in Technique[技术] by 深蓝 (71.8m points)

One of my data inputs looks like this:

O75663  O95456  O75663  O95456 
O95400  O95670  O95400  O95670
O95433          O95433  O95801 
                O95456  P00352
                O95670  

df<- structure(
    list(
        V1 = structure(c(2L, 3L, 4L, 1L, 1L), 
            .Label = c("", "O75663", "O95400", "O95433"), 
                class = "factor"), 
        V2 = structure(c(2L, 3L, 1L, 1L, 1L), 
            .Label = c("", "O95456", "O95670"), 
                class = "factor"), 
        V3 = structure(1:5, 
            .Label = c("O75663", "O95400", "O95433", "O95456", "O95670"), 
                class = "factor"), 
        V4 = structure(c(2L, 3L, 4L, 5L, 1L), 
            .Label = c("", "O95456", "O95670", "O95801", "P00352"), 
                class = "factor")), 
    .Names = c("V1", "V2", "V3", "V4"), 
        class = "data.frame", 
        row.names = c(NA, -5L))

My second data input looks like this:

I want to know for each string from the second data set in which columns of the first data it can be found. It might be in none of them, or several. I want the output to look like the following:

strings    column ids 
O75663      1, 3
O95400      1, 3
O95433      1, 3
O95456      2, 3, 4
O95670      2, 3, 4
O95801      4
P00352      4
P00492      NA

The new strs:

strs <- structure(
    list(
        strings = structure(c(2L, 3L, 4L, 5L, 6L, 7L, 1L, 1L), 
            .Label = c("", "O75663", "O95400", "O95433", "O95456", "O95670", "O95801"),
                class = "factor"), 
        strings2 = structure(c(4L, 2L, 6L, 5L, 3L, 1L, 1L, 1L), 
            .Label = c("", "O75663", "O95433", "O95456", "P00352", "P00492"), 
                class = "factor"), 
        strings3 = structure(c(4L, 6L, 7L, 8L, 2L, 3L, 5L, 1L), 
            .Label = c("", "O75663", "O95400", "O95456", "O95670", "O95801", "P00352", "P00492"), 
                class = "factor"), 
        strings4 = structure(c(2L, 5L, 3L, 4L, 1L, 1L, 1L, 1L), 
            .Label = c("", "O95400", "O95456", "O95801", "P00492"), 
                class = "factor"), 
        strings5 = structure(c(8L, 2L, 7L, 1L, 3L, 6L, 5L, 4L), 
            .Label = c("O75663", "O95400", "O95433", "O95456", "O95670", "O95801", "P00352", "P00492"), 
                class = "factor")), 
    .Names = c("strings", "strings2", "strings3", "strings4", "strings5"), 
        class = "data.frame", 
        row.names = c(NA, -8L))


lut <- structure(
    list(
        V1 = c("O75663", "O95400", "O95433", NA, NA), 
        V2 = c("O95456", "O95670", NA, NA, NA), 
        V3 = c("O75663", "O95400", "O95433", "O95456", "O95670"), 
        V4 = c("O95456", "O95670", "O95801", "P00352", NA), 
        V1 = c("O75663", "O95400", "O95433", NA, NA), 
        V2 = c("O95456", "O95670", NA, NA, NA), 
        V3 = c("O75663", "O95400", "O95433", "O95456", "O95670"), 
        V4 = c("O95456", "O95670", "O95801", "P00352", NA)), 
    .Names = c("V1", "V2", "V3", "V4", "V1", "V2", "V3", "V4"), 
    row.names = c(NA, -5L), class = "data.frame")

df<- setDT(strs)[, paste0('colids_',seq_along(strs)) := 
    lapply(.SD, function(x) toString(which(colSums(lut == x, na.rm=TRUE) > 0))), 
    by = 1:nrow(strs)][]

Then I get this error:

Error in df1 == x : comparison of these types is not implemented
In addition: Warning message: In is.data.frame(x) :
Incompatible methods ("Ops.data.frame", "Ops.factor") for "=="

See Question&Answers more detail:os

与恶龙缠斗过久,自身亦成为恶龙；凝视深渊过久,深渊将回以凝视…

268 views

1 Answer

深蓝 · Answer 1 · 2022-01-31T07:26:30+0000

A possible solution with base R by using a combination of colSums, which, toString and apply:

strs$colids <- apply(strs, 1, function(x) toString(which(colSums(lut == x, na.rm=TRUE) > 0)))

which gives:

> strs
  strings  colids
1  O75663    1, 3
2  O95400    1, 3
3  O95433    1, 3
4  O95456 2, 3, 4
5  O95670 2, 3, 4
6  O95801       4
7  P00352       4
8  P00492

To see what each part does, start by looking at the output of lut == 'O75663' which will give you a TRUE/FALSE table. By wrapping this in colSums you sum the TRUE/FALSE. A 0 means that there is no match in that column for that string, a number above zero means that there is one or more matches. With which you get the column indexes and by wrapping that in toString you get a character values with indexes of the matching columns.

This approach could also be implemented with either data.table or dplyr:

library(data.table)
setDT(strs)[, colids := toString(which(colSums(lut == strings, na.rm=TRUE) > 0)), by = 1:nrow(strs)][]

library(dplyr)
strs %>% rowwise() %>% mutate(colids = toString(which(colSums(lut == strings, na.rm=TRUE) > 0)))

In response to your comment: An example for multiple columns in strs with data.table:

# create an extra strings column
set.seed(1)
strs$strings2 <- sample(strs$strings)

# create two 'colids' columns
library(data.table)
setDT(strs)[, c('colids1','colids2') := lapply(.SD, function(x) toString(which(colSums(lut == x, na.rm=TRUE) > 0))), by = 1:nrow(strs)][]

which gives:

   strings strings2 colids1 colids2
1:  O75663   O95433    1, 3    1, 3
2:  O95400   P00492    1, 3        
3:  O95433   O95456    1, 3 2, 3, 4
4:  O95456   O95670 2, 3, 4 2, 3, 4
5:  O95670   O75663 2, 3, 4    1, 3
6:  O95801   P00352       4       4
7:  P00352   O95400       4    1, 3
8:  P00492   O95801               4

Used data:

lut <- structure(list(V1 = c("O75663", "O95400", "O95433", NA, NA), 
                      V2 = c("O95456", "O95670", NA, NA, NA), 
                      V3 = c("O75663", "O95400", "O95433", "O95456", "O95670"), 
                      V4 = c("O95456", "O95670", "O95801", "P00352", NA)), 
                 .Names = c("V1", "V2", "V3", "V4"), class = "data.frame", row.names = c(NA, -5L))

strs <- structure(list(strings = c("O75663", "O95400", "O95433", "O95456", "O95670", "O95801", "P00352", "P00492")), 
                  .Names = "strings", class = "data.frame", row.names = c(NA, -8L))

With regard to the extended example you added to your question: The reason you are getting that error is because you are trying to compare factor-variables with character-variables. See the difference in output between sapply(strs,class) and sapply(lut,class):

> sapply(strs,class)
strings1 strings2 strings3 strings4 strings5 
"factor" "factor" "factor" "factor" "factor" 
> sapply(lut,class)
         V1          V2          V3          V4          V5          V6          V7          V8 
"character" "character" "character" "character" "character" "character" "character" "character"

It is therefore necessary to convert the factor's to character's first and then do the comparison. The following code:

library(data.table)
setDT(strs)[, lapply(.SD, as.character)
            ][, paste0('colids.',seq_along(strs)) := lapply(.SD, function(x) toString(which(colSums(lut == x, na.rm=TRUE) > 0))), 
              by = 1:nrow(strs)][]

now gives the correct output:

   strings1 strings2 strings3 strings4 strings5         colids.1         colids.2         colids.3         colids.4         colids.5
1:   O75663   O95456   O95456   O95400   P00492       1, 3, 5, 7 2, 3, 4, 6, 7, 8 2, 3, 4, 6, 7, 8       1, 3, 5, 7                 
2:   O95400   O75663   O95801   P00492   O95400       1, 3, 5, 7       1, 3, 5, 7             4, 8                        1, 3, 5, 7
3:   O95433   P00492   P00352   O95456   P00352       1, 3, 5, 7                              4, 8 2, 3, 4, 6, 7, 8             4, 8
4:   O95456   P00352   P00492   O95801   O75663 2, 3, 4, 6, 7, 8             4, 8                              4, 8       1, 3, 5, 7
5:   O95670   O95433   O75663            O95433 2, 3, 4, 6, 7, 8       1, 3, 5, 7       1, 3, 5, 7                        1, 3, 5, 7
6:   O95801            O95400            O95801             4, 8                        1, 3, 5, 7                              4, 8
7:                     O95670            O95670                                   2, 3, 4, 6, 7, 8                  2, 3, 4, 6, 7, 8
8:                                       O95456                                                                     2, 3, 4, 6, 7, 8

Used data extended example:

strs <- structure(list(strings1 = structure(c(2L, 3L, 4L, 5L, 6L, 7L, 1L, 1L), .Label = c("", "O75663", "O95400", "O95433", "O95456", "O95670", "O95801"), class = "factor"), 
                       strings2 = structure(c(4L, 2L, 6L, 5L, 3L, 1L, 1L, 1L), .Label = c("", "O75663", "O95433", "O95456", "P00352", "P00492"), class = "factor"), 
                       strings3 = structure(c(4L, 6L, 7L, 8L, 2L, 3L, 5L, 1L), .Label = c("", "O75663", "O95400", "O95456", "O95670", "O95801", "P00352", "P00492"), class = "factor"), 
                       strings4 = structure(c(2L, 5L, 3L, 4L, 1L, 1L, 1L, 1L), .Label = c("", "O95400", "O95456", "O95801", "P00492"), class = "factor"), 
                       strings5 = structure(c(8L, 2L, 7L, 1L, 3L, 6L, 5L, 4L), .Label = c("O75663", "O95400", "O95433", "O95456", "O95670", "O95801", "P00352", "P00492"), class = "factor")), 
                  .Names = c("strings1", "strings2", "strings3", "strings4", "strings5"), class = "data.frame", row.names = c(NA, -8L))

lut <- structure(list(V1 = c("O75663", "O95400", "O95433", NA, NA), 
                      V2 = c("O95456", "O95670", NA, NA, NA), 
                      V3 = c("O75663", "O95400", "O95433", "O95456", "O95670"), 
                      V4 = c("O95456", "O95670", "O95801", "P00352", NA), 
                      V5 = c("O75663", "O95400", "O95433", NA, NA), 
                      V6 = c("O95456", "O95670", NA, NA, NA), 
                      V7 = c("O75663", "O95400", "O95433", "O95456", "O95670"), 
                      V8 = c("O95456", "O95670", "O95801", "P00352", NA)), 
                 .Names = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8"), row.names = c(NA, -5L), class = "data.frame")

Categories

r - Finding the index based on two data frames of strings

Please log in or register to add a comment.

Please log in or register to answer this question.

1 Answer

Please log in or register to add a comment.

Just Browsing Browsing

Most popular tags