Reading text content from the web

# open a connection to the Moby Dick ebook at Project Gutenberg
moby_url = url("http://www.gutenberg.org/ebooks/2701.txt.utf-8")
moby_url
##                                      description 
## "http://www.gutenberg.org/ebooks/2701.txt.utf-8" 
##                                            class 
##                                            "url" 
##                                             mode 
##                                              "r" 
##                                             text 
##                                           "text" 
##                                           opened 
##                                         "closed" 
##                                         can read 
##                                            "yes" 
##                                        can write 
##                                             "no"
# read the first 500 lines
moby_dick = readLines(moby_url, n = 500)
length(moby_dick)
## [1] 500
head(moby_dick, 12) # each line is an element of a char vector
##  [1] "The Project Gutenberg EBook of Moby Dick; or The Whale, by Herman Melville"
##  [2] ""                                                                             
##  [3] "This eBook is for the use of anyone anywhere at no cost and with"             
##  [4] "almost no restrictions whatsoever.  You may copy it, give it away or"         
##  [5] "re-use it under the terms of the Project Gutenberg License included"          
##  [6] "with this eBook or online at www.gutenberg.org"                               
##  [7] ""                                                                             
##  [8] ""                                                                             
##  [9] "Title: Moby Dick; or The Whale"                                               
## [10] ""                                                                             
## [11] "Author: Herman Melville"                                                      
## [12] ""

Another option is to download the file once with download.file() and read it locally, which avoids overloading the site’s server.

download.file("http://www.gutenberg.org/cache/epub/2701/pg2701.txt", "mobydick.txt")
moby_dick <- readLines("mobydick.txt", n=500) # read the 1st 500 lines
length(moby_dick)
## [1] 500
head(moby_dick, 12) # each line is again an element of a char vector
##  [1] "The Project Gutenberg EBook of Moby Dick; or The Whale, by Herman Melville"
##  [2] ""                                                                             
##  [3] "This eBook is for the use of anyone anywhere at no cost and with"             
##  [4] "almost no restrictions whatsoever.  You may copy it, give it away or"         
##  [5] "re-use it under the terms of the Project Gutenberg License included"          
##  [6] "with this eBook or online at www.gutenberg.org"                               
##  [7] ""                                                                             
##  [8] ""                                                                             
##  [9] "Title: Moby Dick; or The Whale"                                               
## [10] ""                                                                             
## [11] "Author: Herman Melville"                                                      
## [12] ""

The novel itself starts at line 536. Let’s read its first 10 lines, line by line, using scan():

n.lines <- 10
moby_dick_chap1 <- rep(NA, n.lines)
skip <- 535
# reading 10 lines (line-by-line using scan)
for (i in 1L:n.lines) {
  one_line = scan("mobydick.txt", what = "", skip = skip, nlines = 1)
  moby_dick_chap1[i] = paste(one_line, collapse = " ")
  skip = skip + 1
}
moby_dick_chap1
##  [1] "CHAPTER 1. Loomings."                                                    
##  [2] ""                                                                        
##  [3] ""                                                                        
##  [4] "Call me Ishmael. Some years ago--never mind how long precisely--having"  
##  [5] "little or no money in my purse, and nothing particular to interest me on"
##  [6] "shore, I thought I would sail about a little and see the watery part of" 
##  [7] "the world. It is a way I have of driving off the spleen and regulating"  
##  [8] "the circulation. Whenever I find myself growing grim about the mouth;"   
##  [9] "whenever it is a damp, drizzly November in my soul; whenever I find"     
## [10] "myself involuntarily pausing before coffin warehouses, and bringing up"

We can also read HTML content:

skulls = readLines("http://lib.stat.cmu.edu/DASL/Datafiles/EgyptianSkulls.html")
head(skulls, 12)
##  [1] "<TITLE>Egyptian Skulls Datafile</TITLE>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
##  [2] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
##  [3] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
##  [4] "<hr size=2><center><table border=1 cellpadding=0 cellspacing=0><tr><td align=center><table border=1 cellpadding=1 cellspacing=0><tr><td><A HREF=\"../DataArchive.html\"><IMG SRC=\"../InlineImages/mainmenu.gif\" alt=\"Go to Main Menu\"></a></td></tr></table></td><td align=center><table border=1 cellpadding=1 cellspacing=0><tr><td><A HREF=\"/cgi-bin/dasl.cgi\"><IMG SRC=\"../InlineImages/powersearchsmall.gif\" alt=\"Go to Power Search\"></a></td></tr></table></td><td align=center><table border=1 cellpadding=1 cellspacing=0><tr><td><A HREF=\"../allsubjects.html\"><IMG SRC=\"../InlineImages/allsubjects.gif\" alt=\"Go to Datafile Subjects\"></a></td></tr></table></td></tr></table></center><hr size=2>"
##  [5] "<B><DT>Datafile Name:</B>   Egyptian Skulls"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
##  [6] "<B><DT>Datafile Subjects:</B>   <dsubjects>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
##  [7] "<A HREF=\"/cgi-bin/dasl.cgi?query=Archeology&submit=Search&metaname=dsubjects&sort=swishrank\">Archeology</A>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
##  [8] ", "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
##  [9] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
## [10] "<A HREF=\"/cgi-bin/dasl.cgi?query=Biology&submit=Search&metaname=dsubjects&sort=swishrank\">Biology</A>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
## [11] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
## [12] "</dsubjects>"

Read tables from the web

taxon_url = "http://www.bio.ic.ac.uk/research/mjcraw/therbook/data/taxon.txt"
taxon = read.table(taxon_url, header = TRUE, row.names = 1)
head(taxon)
##   Petals Internode Sepal Bract Petiole  Leaf Fruit
## 1  5.621     29.48 2.462 18.20   11.28 1.128 7.876
## 2  4.995     28.36 2.429 17.65   11.04 1.198 7.025
## 3  4.768     27.25 2.570 19.41   10.49 1.004 7.817
## 4  6.299     25.92 2.066 18.38   11.80 1.614 7.672
## 5  6.489     25.21 2.902 17.31   10.12 1.813 7.758
## 6  5.786     25.52 2.656 17.07   10.56 1.956 7.881

Read tables from an HTTPS connection

library(RCurl)
## Loading required package: bitops
iris_file = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
iris_url  = getURLContent(iris_file, ssl.verifypeer = FALSE) # use when getURL(iris_file) fails
iris_data = read.csv(textConnection(iris_url), header = FALSE)
names(iris_data) <- c("SL", "SW", "PL", "PW", "Species")
head(iris_data)
##    SL  SW  PL  PW     Species
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa

Read Google spreadsheet files

The document’s public key is needed:

library(RCurl)

google_docs = "https://docs.google.com/spreadsheet/"
cars_key = "pub?key=0AjoVnZ9iB261dHRfQlVuWDRUSHdZQ1A4N294TEstc0E&output=csv" # public key of data 'cars'
cars_csv = getURLContent(paste(google_docs, cars_key, sep = ""), ssl.verifypeer = FALSE)
cars2004 = read.csv(textConnection(cars_csv), row.names = 1, header = TRUE) # import data into R (through a text connection)
head(cars2004)
##                     Cylinders Horsepower Speed Weight Width Length
## Citroen C2               1124         61   158    932  1659   3666
## Smart Fortwo              698         52   135    730  1515   2500
## Mini 1.6 170             1598        170   218   1215  1690   3625
## Nissan Micra 1.2         1240         65   154    965  1660   3715
## Renault Clio 3.0 V6      2946        255   245   1400  1810   3812
## Audi A3 1.9              1896        105   187   1295  1765   4203

To read Excel .xls and .xlsx files check this post. To read .csv files use the function read.csv().
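
As a minimal sketch (the file name here is hypothetical), read.csv() works the same way on local files and on plain http URLs:

# a hypothetical comma-separated file with a header row
my_data = read.csv("my_data.csv", header = TRUE, stringsAsFactors = FALSE)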

Get a Wikipedia HTML table

library(XML)
## Warning: package 'XML' was built under R version 3.1.1
swim_wiki = "http://en.wikipedia.org/wiki/World_record_progression_1500_metres_freestyle"
swim1500 = readHTMLTable(swim_wiki, which = 1, stringsAsFactors = FALSE) # read 1st table from webpage
head(swim1500)
##   #    Time                             Name   Nationality
## 1 1 22:48.4      Taylor , Henry Henry Taylor Great Britain
## 2 2 22:00.0  Hodgson , George George Hodgson        Canada
## 3 3 21:35.3            Borg , Arne Arne Borg        Sweden
## 4 4 21:15.0            Borg , Arne Arne Borg        Sweden
## 5 5 21:11.4            Borg , Arne Arne Borg        Sweden
## 6 6 20:06.6      Charlton , Boy Boy Charlton     Australia
##                           Date          Meet
## 1 01908-07-25-0000Jul 25, 1908 Olympic Games
## 2 01912-07-10-0000Jul 10, 1912 Olympic Games
## 3  01923-07-08-0000Jul 8, 1923             -
## 4 01924-01-30-0000Jan 30, 1924             -
## 5 01924-07-13-0000Jul 13, 1924             -
## 6 01924-07-15-0000Jul 15, 1924 Olympic Games
##                                           Location Ref
## 1 United Kingdom, London !  London, United Kingdom    
## 2           Sweden, Stockholm !  Stockholm, Sweden    
## 3         Sweden, Gothenburg !  Gothenburg, Sweden    
## 4           Australia, Sydney !  Sydney, Australia    
## 5                   France, Paris !  Paris, France    
## 6                   France, Paris !  Paris, France

Parsing XML

The main function is xmlParse(), a DOM parser, i.e. a parser that reads the whole XML document into a tree structure.

library(XML)
plant = "http://www.xmlfiles.com/examples/plant_catalog.xml"
doc1 = xmlParse(plant)     # Parse as a C structure
doc2 = xmlTreeParse(plant) # Parse as an R structure (this function is an xmlParse wrapper)
class(doc2) # class "XMLDocument" is implemented as a hierarchy of lists
## [1] "XMLDocument"         "XMLAbstractDocument"

To parse HTML:

doc3 = htmlTreeParse("http://www.r-project.org/mail.html") # parse into a R structure
class(doc3)
## [1] "XMLDocumentContent"

After parsing, we need to be able to access the document’s internal information.

To access the internal structure of a node

Some examples:

xmlName( xmlRoot(doc2) ) # the name of the root node
## [1] "CATALOG"
xmlSize( xmlChildren( xmlRoot(doc2) ) ) # how many children it has
## [1] 36
xmlChildren( xmlRoot(doc2) )[[1]]       # the 1st child
## <PLANT>
##  <COMMON>Bloodroot</COMMON>
##  <BOTANICAL>Sanguinaria canadensis</BOTANICAL>
##  <ZONE>4</ZONE>
##  <LIGHT>Mostly Shady</LIGHT>
##  <PRICE>$2.44</PRICE>
##  <AVAILABILITY>031599</AVAILABILITY>
## </PLANT>
xmlChildren( xmlChildren( xmlRoot(doc2) )[[1]] )[[2]]  
## <BOTANICAL>Sanguinaria canadensis</BOTANICAL>
xmlValue( xmlChildren( xmlChildren( xmlRoot(doc2) )[[1]] )[[2]] )
## [1] "Sanguinaria canadensis"

A simpler example:

xml_string = c(
'<?xml version="1.0" encoding="UTF-8"?>',
'<movies>',
    '<movie mins="126" lang="eng">',
        '<title>Good Will Hunting</title>',
        '<director>',
        '<first_name>Gus</first_name>',
        '<last_name>Van Sant</last_name>',
        '</director>',
        '<year>1998</year>',
        '<genre>drama</genre>',
    '</movie>',

    '<movie mins="106" lang="spa">',
        '<title>Y tu mama tambien</title>',
        '<director>',
        '<first_name>Alfonso</first_name>',
        '<last_name>Cuaron</last_name>',
        '</director>',
        '<year>2001</year>',
        '<genre>drama</genre>',
    '</movie>',
'</movies>')

# parse xml content
movies_xml = xmlParse(xml_string, asText = TRUE)
root = xmlRoot(movies_xml)
movie_child = xmlChildren(root)
goodwill = movie_child[[1]]
goodwill
## <movie mins="126" lang="eng">
##   <title>Good Will Hunting</title>
##   <director>
##     <first_name>Gus</first_name>
##     <last_name>Van Sant</last_name>
##   </director>
##   <year>1998</year>
##   <genre>drama</genre>
## </movie>
xmlName(goodwill)
## [1] "movie"
xmlSize(goodwill)
## [1] 4
xmlAttrs(goodwill)
##  mins  lang 
## "126" "eng"
xmlGetAttr(goodwill, name = 'lang')
## [1] "eng"
xmlValue(goodwill) # node content (as character string)
## [1] "Good Will HuntingGusVan Sant1998drama"
xmlChildren(goodwill)
## $title
## <title>Good Will Hunting</title> 
## 
## $director
## <director>
##   <first_name>Gus</first_name>
##   <last_name>Van Sant</last_name>
## </director> 
## 
## $year
## <year>1998</year> 
## 
## $genre
## <genre>drama</genre> 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"
gusvan = xmlChildren(goodwill)[[2]]
gusvan
## <director>
##   <first_name>Gus</first_name>
##   <last_name>Van Sant</last_name>
## </director>
xmlParent(gusvan)
## <movie mins="126" lang="eng">
##   <title>Good Will Hunting</title>
##   <director>
##     <first_name>Gus</first_name>
##     <last_name>Van Sant</last_name>
##   </director>
##   <year>1998</year>
##   <genre>drama</genre>
## </movie>
xmlChildren(gusvan)
## $first_name
## <first_name>Gus</first_name> 
## 
## $last_name
## <last_name>Van Sant</last_name> 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"
getSibling(goodwill)
## <movie mins="106" lang="spa">
##   <title>Y tu mama tambien</title>
##   <director>
##     <first_name>Alfonso</first_name>
##     <last_name>Cuaron</last_name>
##   </director>
##   <year>2001</year>
##   <genre>drama</genre>
## </movie>

We can iterate and apply functions to certain nodes:

movie_child
## $movie
## <movie mins="126" lang="eng">
##   <title>Good Will Hunting</title>
##   <director>
##     <first_name>Gus</first_name>
##     <last_name>Van Sant</last_name>
##   </director>
##   <year>1998</year>
##   <genre>drama</genre>
## </movie> 
## 
## $movie
## <movie mins="106" lang="spa">
##   <title>Y tu mama tambien</title>
##   <director>
##     <first_name>Alfonso</first_name>
##     <last_name>Cuaron</last_name>
##   </director>
##   <year>2001</year>
##   <genre>drama</genre>
## </movie> 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"
sapply(movie_child, xmlAttrs)
##      movie movie
## mins "126" "106"
## lang "eng" "spa"
sapply(movie_child, function(nd) xmlValue(xmlChildren(nd)$title) )
##               movie               movie 
## "Good Will Hunting" "Y tu mama tambien"
xmlSApply(root, xmlAttrs) # sapply wrapper that operates on the sub-nodes of the given node
##      movie movie
## mins "126" "106"
## lang "eng" "spa"
xmlSApply(root, function(nd) xmlValue(xmlChildren(nd)$title) )
##               movie               movie 
## "Good Will Hunting" "Y tu mama tambien"

A better way to explore the XML tree is to query for specific nodes. This is possible through XPath.

XPath has its own syntax, which is worth learning in order to exploit its full power.

For instance, /movies/movie[1] selects the first movie element that is a child of the movies element:

getNodeSet(movies_xml, "/movies/movie[1]")
## [[1]]
## <movie mins="126" lang="eng">
##   <title>Good Will Hunting</title>
##   <director>
##     <first_name>Gus</first_name>
##     <last_name>Van Sant</last_name>
##   </director>
##   <year>1998</year>
##   <genre>drama</genre>
## </movie> 
## 
## attr(,"class")
## [1] "XMLNodeSet"

The main parts of XPath syntax:

+ / selects from the root node
+ // selects matching nodes anywhere in the document
+ * matches any element node
+ @ selects attributes
+ [ ] adds a predicate, i.e. a condition on the selected nodes

Some examples:

getNodeSet(movies_xml, "/movies/movie")
## [[1]]
## <movie mins="126" lang="eng">
##   <title>Good Will Hunting</title>
##   <director>
##     <first_name>Gus</first_name>
##     <last_name>Van Sant</last_name>
##   </director>
##   <year>1998</year>
##   <genre>drama</genre>
## </movie> 
## 
## [[2]]
## <movie mins="106" lang="spa">
##   <title>Y tu mama tambien</title>
##   <director>
##     <first_name>Alfonso</first_name>
##     <last_name>Cuaron</last_name>
##   </director>
##   <year>2001</year>
##   <genre>drama</genre>
## </movie> 
## 
## attr(,"class")
## [1] "XMLNodeSet"
getNodeSet(movies_xml, "/movies/movie[1]/title")
## [[1]]
## <title>Good Will Hunting</title> 
## 
## attr(,"class")
## [1] "XMLNodeSet"
getNodeSet(movies_xml, "/movies/movie/director/first_name")
## [[1]]
## <first_name>Gus</first_name> 
## 
## [[2]]
## <first_name>Alfonso</first_name> 
## 
## attr(,"class")
## [1] "XMLNodeSet"
getNodeSet(movies_xml, "//last_name")
## [[1]]
## <last_name>Van Sant</last_name> 
## 
## [[2]]
## <last_name>Cuaron</last_name> 
## 
## attr(,"class")
## [1] "XMLNodeSet"
getNodeSet(movies_xml, "/movies/movie[@lang='spa']/title")
## [[1]]
## <title>Y tu mama tambien</title> 
## 
## attr(,"class")
## [1] "XMLNodeSet"
getNodeSet(movies_xml, "/movies/movie[@mins>120]/title")
## [[1]]
## <title>Good Will Hunting</title> 
## 
## attr(,"class")
## [1] "XMLNodeSet"
getNodeSet(movies_xml, "/movies/movie[@mins>120]/*")
## [[1]]
## <title>Good Will Hunting</title> 
## 
## [[2]]
## <director>
##   <first_name>Gus</first_name>
##   <last_name>Van Sant</last_name>
## </director> 
## 
## [[3]]
## <year>1998</year> 
## 
## [[4]]
## <genre>drama</genre> 
## 
## attr(,"class")
## [1] "XMLNodeSet"
getNodeSet(movies_xml, "/movies/movie[@mins>120]/@*")
## [[1]]
##  mins 
## "126" 
## attr(,"class")
## [1] "XMLAttributeValue"
## 
## [[2]]
##  lang 
## "eng" 
## attr(,"class")
## [1] "XMLAttributeValue"
## 
## attr(,"class")
## [1] "XMLNodeSet"

Parsing JSON

JSON has the following data types: null, true, false, number, string, arrays (using []) and objects/dictionaries (using {}). For example:

{ "name": ["X", "Y", "Z"], "grams": [300, 200, 500], "qty": [4, 5, null], "new": [true, false, true] }
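
A string like this can be parsed into an R list with fromJSON() from the RJSONIO package; a minimal sketch:

library(RJSONIO)
json_str = '{"name": ["X", "Y", "Z"], "grams": [300, 200, 500], "qty": [4, 5, null], "new": [true, false, true]}'
fromJSON(json_str)  # a named list with components name, grams, qty and new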

library(RJSONIO)
## Warning: package 'RJSONIO' was built under R version 3.1.1

The RJSONIO package has two main functions: toJSON(), which converts an R object into a JSON string, and fromJSON(), which converts a JSON string into an R object:

swdf = as.data.frame(rbind(
  c("Anakin", "male", "Tatooine", "41.9BBY", "yes"),
  c("Amidala", "female", "Naboo", "46BBY", "no"),
  c("Luke", "male", "Tatooine", "19BBY", "yes"),
  c("Leia", "female", "Alderaan", "19BBY", "no"),
  c("Obi-Wan", "male", "Stewjon", "57BBY", "yes"),
  c("Han", "male", "Corellia", "29BBY", "no"),
  c("Palpatine", "male", "Naboo", "82BBY", "no"),
  c("R2-D2", "unknown", "Naboo", "33BBY", "no")
))
names(swdf) = c("Name", "Gender", "Homeworld", "Born", "Jedi")
swdf
##        Name  Gender Homeworld    Born Jedi
## 1    Anakin    male  Tatooine 41.9BBY  yes
## 2   Amidala  female     Naboo   46BBY   no
## 3      Luke    male  Tatooine   19BBY  yes
## 4      Leia  female  Alderaan   19BBY   no
## 5   Obi-Wan    male   Stewjon   57BBY  yes
## 6       Han    male  Corellia   29BBY   no
## 7 Palpatine    male     Naboo   82BBY   no
## 8     R2-D2 unknown     Naboo   33BBY   no
sw_json = toJSON(swdf) # convert R data.frame to JSON
cat(sw_json)
## {
##  "Name": [ "Anakin", "Amidala", "Luke", "Leia", "Obi-Wan", "Han", "Palpatine", "R2-D2" ],
## "Gender": [ "male", "female", "male", "female", "male", "male", "male", "unknown" ],
## "Homeworld": [ "Tatooine", "Naboo", "Tatooine", "Alderaan", "Stewjon", "Corellia", "Naboo", "Naboo" ],
## "Born": [ "41.9BBY", "46BBY", "19BBY", "19BBY", "57BBY", "29BBY", "82BBY", "33BBY" ],
## "Jedi": [ "yes", "no", "yes", "no", "yes", "no", "no", "no" ] 
## }
sw_R = fromJSON(sw_json) # convert JSON string to R list
sw_R
## $Name
## [1] "Anakin"    "Amidala"   "Luke"      "Leia"      "Obi-Wan"   "Han"      
## [7] "Palpatine" "R2-D2"    
## 
## $Gender
## [1] "male"    "female"  "male"    "female"  "male"    "male"    "male"   
## [8] "unknown"
## 
## $Homeworld
## [1] "Tatooine" "Naboo"    "Tatooine" "Alderaan" "Stewjon"  "Corellia"
## [7] "Naboo"    "Naboo"   
## 
## $Born
## [1] "41.9BBY" "46BBY"   "19BBY"   "19BBY"   "57BBY"   "29BBY"   "82BBY"  
## [8] "33BBY"  
## 
## $Jedi
## [1] "yes" "no"  "yes" "no"  "yes" "no"  "no"  "no"

Let’s import some JSON content from the web and clean it up:

miser = "http://mbostock.github.io/protovis/ex/miserables.js"
miserables = readLines(miser)
miserables = miserables[-c(1:11)]                   # eliminate first 11 lines (containing comments)
miserables[1]                  = "{"                # open curly bracket in first line
miserables[length(miserables)] = "}"                # closing curly bracket in last line
miserables_str = paste(miserables, collapse = "")   # JSON content in one single string
substr(miserables_str,1,120)
## [1] "{  nodes:[    {nodeName:\"Myriel\", group:1},    {nodeName:\"Napoleon\", group:1},    {nodeName:\"Mlle. Baptistine\", group:1}"
mis1 = fromJSON(miserables_str)
lapply(mis1, length)
## $ode
## [1] 77
## 
## $ink
## [1] 254
head( mis1[[1]], 3)
## [[1]]
## [[1]]$odeNam
## [1] "Myriel"
## 
## [[1]]$rou
## [1] 1
## 
## 
## [[2]]
## [[2]]$odeNam
## [1] "Napoleon"
## 
## [[2]]$rou
## [1] 1
## 
## 
## [[3]]
## [[3]]$odeNam
## [1] "Mlle. Baptistine"
## 
## [[3]]$rou
## [1] 1
head( mis1[[2]], 3)
## [[1]]
## ourc arge  alu 
##    1    0    1 
## 
## [[2]]
## ourc arge  alu 
##    2    0    8 
## 
## [[3]]
## ourc arge  alu 
##    3    0   10
# e.g., translating the components into clean data frames
names <- as.data.frame(mis1[[1]])
names <- data.frame(codeName=t(names[,c(T,F)]), group=t(names[,c(F,T)]))
rownames(names) <- 1:nrow(names)
head(names,10)
##            codeName group
## 1            Myriel     1
## 2          Napoleon     1
## 3  Mlle. Baptistine     1
## 4     Mme. Magloire     1
## 5    Countess de Lo     1
## 6          Geborand     1
## 7      Champtercier     1
## 8          Cravatte     1
## 9             Count     1
## 10          Old Man     1
links <- t(as.data.frame(mis1[[2]]))
rownames(links) <- 1:nrow(links)
colnames(links) <- c("source","target","value")
head(links,10)
##    source target value
## 1       1      0     1
## 2       2      0     8
## 3       3      0    10
## 4       3      2     6
## 5       4      0     1
## 6       5      0     1
## 7       6      0     1
## 8       7      0     1
## 9       8      0     2
## 10      9      0     1

Making R a web client with RCurl

RCurl provides an R interface to client-side HTTP.

It allows us to (a couple of these capabilities are sketched right after this list):

+ download URLs
+ submit forms in different ways
+ use HTTPS (the secure HTTP)
+ handle authentication using passwords
+ use FTP to download files
+ use persistent connections
+ upload files
+ handle escaped characters in requests
+ handle binary data
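
A minimal sketch of two of these capabilities; the FTP server, user name and password below are hypothetical:

library(RCurl)

# download a file over FTP, authenticating with a user/password pair
ftp_txt = getURL("ftp://ftp.example.com/pub/data.txt", userpwd = "user:password")

# follow redirects and send a custom user agent while fetching a page
page = getURL("http://www.example.com/", followlocation = TRUE, useragent = "R/RCurl")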

Three main functions: getURL(), getForm() and postForm()

library(RCurl)
library(XML)

rproj = getURL("http://www.r-project.org/")
rproj
## [1] "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\" \"http://www.w3.org/TR/html4/frameset.dtd\">\n<html>\n<head>\n<title>The R Project for Statistical Computing</title>\n<link rel=\"icon\" href=\"favicon.ico\" type=\"image/x-icon\">\n<link rel=\"shortcut icon\" href=\"favicon.ico\" type=\"image/x-icon\">\n<link rel=\"stylesheet\" type=\"text/css\" href=\"R.css\">\n</head>\n\n<FRAMESET cols=\"1*, 4*\">\n<FRAMESET rows=\"120, 1*\">\n<FRAME src=\"logo.html\" name=\"logo\" frameborder=0>\n<FRAME src=\"navbar.html\" name=\"contents\" frameborder=0>\n</FRAMESET>\n<FRAME src=\"main.shtml\" name=\"banner\" frameborder=0>\n<noframes>\n<h1>The R Project for Statistical Computing</h1>\n\nYour browser seems not to support frames,\nhere is the <A href=\"navbar.html\">contents page</A> of the R Project's\nwebsite.\n</noframes>\n</FRAMESET>\n\n\n\n"
rproj_doc = htmlParse(rproj)
rproj_doc # we can parse this HTML using the XML functions seen earlier
## <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd">
## <html>
## <head>
## <title>The R Project for Statistical Computing</title>
## <link rel="icon" href="favicon.ico" type="image/x-icon">
## <link rel="shortcut icon" href="favicon.ico" type="image/x-icon">
## <link rel="stylesheet" type="text/css" href="R.css">
## </head>
## <frameset cols="1*, 4*">
## <frameset rows="120, 1*">
## <frame src="logo.html" name="logo" frameborder="0">
## <frame src="navbar.html" name="contents" frameborder="0">
## </frameset>
## <frame src="main.shtml" name="banner" frameborder="0">
## <noframes>
## <body>
## <h1>The R Project for Statistical Computing</h1>
## 
## Your browser seems not to support frames,
## here is the <a href="navbar.html">contents page</a> of the R Project's
## website.
## 
## </body>
## </noframes>
## </frameset>
## </html>
## 

RCurl can also make the requests associated with Web forms:

google.request <- getForm("http://www.google.com/search", hl="en", lr="", ie="ISO-8859-1",  q="RCurl", btnG="Search")
google_doc = htmlParse(google.request)
# get only the <a> nodes and show their 'href' attributes
head( sapply( getNodeSet(google_doc,"//a"), function(nd) xmlGetAttr(nd, name = 'href')  ), 15)
##  [1] "https://www.google.com/webhp?tab=ww"                                                              
##  [2] "http://www.google.com/search?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&tbm=isch&source=og&sa=N&tab=wi"      
##  [3] "http://maps.google.com/maps?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&sa=N&tab=wl"                          
##  [4] "https://play.google.com/?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&sa=N&tab=w8"                             
##  [5] "http://www.youtube.com/results?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&sa=N&tab=w1"                       
##  [6] "http://news.google.com/nwshp?hl=en&tab=wn"                                                        
##  [7] "https://mail.google.com/mail/?tab=wm"                                                             
##  [8] "https://drive.google.com/?tab=wo"                                                                 
##  [9] "http://www.google.com/intl/en/options/"                                                           
## [10] "https://www.google.com/calendar?tab=wc"                                                           
## [11] "http://translate.google.com/?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&sa=N&tab=wT"                         
## [12] "http://www.google.com/mobile/?hl=en&tab=wD"                                                       
## [13] "http://www.google.com/search?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&tbo=u&tbm=bks&source=og&sa=N&tab=wp" 
## [14] "https://wallet.google.com/manage/?tab=wa"                                                         
## [15] "http://www.google.com/search?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&tbo=u&tbm=shop&source=og&sa=N&tab=wf"
# or, more simply:
head( getHTMLLinks(google_doc), 15)
##  [1] "https://www.google.com/webhp?tab=ww"                                                              
##  [2] "http://www.google.com/search?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&tbm=isch&source=og&sa=N&tab=wi"      
##  [3] "http://maps.google.com/maps?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&sa=N&tab=wl"                          
##  [4] "https://play.google.com/?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&sa=N&tab=w8"                             
##  [5] "http://www.youtube.com/results?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&sa=N&tab=w1"                       
##  [6] "http://news.google.com/nwshp?hl=en&tab=wn"                                                        
##  [7] "https://mail.google.com/mail/?tab=wm"                                                             
##  [8] "https://drive.google.com/?tab=wo"                                                                 
##  [9] "http://www.google.com/intl/en/options/"                                                           
## [10] "https://www.google.com/calendar?tab=wc"                                                           
## [11] "http://translate.google.com/?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&sa=N&tab=wT"                         
## [12] "http://www.google.com/mobile/?hl=en&tab=wD"                                                       
## [13] "http://www.google.com/search?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&tbo=u&tbm=bks&source=og&sa=N&tab=wp" 
## [14] "https://wallet.google.com/manage/?tab=wa"                                                         
## [15] "http://www.google.com/search?hl=en&lr=&q=RCurl&um=1&ie=UTF-8&tbo=u&tbm=shop&source=og&sa=N&tab=wf"

The RHTMLForms package (here) can be used together with RCurl to make working with forms easier.