# This is commenting for pythonscrape.py by Tom Roderick, 4/2/2013.
# Another version of this file goes by the name bigdatapython.py; it only downloads pdf files
# from the main UT CompEco course website and is slightly simpler to follow.

import time
import urllib2
# The above imports the libraries we will use. Libraries are prepackaged collections of
# functions. The two libraries here are "standard libraries" -- they (should) come with every
# Python distribution.

baseurl = 'http://laits.utexas.edu/compeco/Courses/'
# Here we create a string called baseurl. A string is a variable consisting of characters.
# We will use this string as a portion of a URL, hence the name baseurl.

page = urllib2.urlopen(baseurl + 'index392.html')
# page is the file-like object returned by urllib2.urlopen. There is not a whole lot we can do
# with it by itself. Note that to concatenate strings one just uses +: baseurl is a string, as
# is 'index392.html'.

information = page.read()
# This reads the data from page into one long string. From here we will work with this data
# directly to find links to files.

information_links = information.split('href="')[1:]
# This takes the long string information and turns it into a list by splitting wherever
# 'href="' is observed. We only need the second through the last of these pieces; splitting on
# href means the first piece is all of the code before the first link, and so contains no
# links. A list is similar to a vector or an array but can contain any kind of object. Note
# that all indices in Python begin with 0 -- the first element is found with
# information_links[0]. Also note that there will be a lot of extra characters along with the
# links we want.

list_of_links = [t[:t.find('"')] for t in information_links]
# Now we create a list of only links. Since we split wherever 'href="' was seen, the next
# quotation mark marks the end of each link, so it is the natural place to cut. What should be
# left are links. (However, we may have false positives since we're interested in files, not
# other webpages. We deal with these as we download.) The structure used to create this new
# list is called a "list comprehension" -- essentially a different way to write a for loop.
# Had we used a for loop directly, we would have the following snippet (uncomment to use):
# list_of_links = []                            # create an empty list
# for t in information_links:                   # begin for loop
#     list_of_links.append(t[:t.find('"')])     # append the new item to the list

list_of_links = list(set(list_of_links))
# This is a nifty Python trick to remove duplicates: take a list, convert it to a set, then
# convert it back to a list.

for x in list_of_links:
    qq = list_of_links.index(x)
    # This gives us the index number of the current link.
    extension = x[::-1][:x[::-1].find('.')][::-1]
    # This is a bit of sorcery. Let's take it a step at a time:
    #   extension is a new variable name.
    #   x[::-1] takes the string x (the current link from list_of_links) and reverses it. The
    #     -1 in [::-1] refers to the step size; see the Python documentation on slicing.
    #   In the reversed string x[::-1], we find the first instance of the period '.', which
    #     corresponds to the last period in x -- the place where the extension begins. This is
    #     not very robust code: if there is no period at all, find returns -1 and the slice
    #     keeps nearly the whole string (the length check below catches most of those cases).
    #   Now we have the extension, but it's reversed. So we reverse it again; we take the
    #     reversed extension x[::-1][:x[::-1].find('.')] and reverse it with [::-1].
    # Slicing is fun but can get complex quickly! Be sure to document what you do!
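    # A quick worked example of the slicing above, using a hypothetical link value (not part
    # of the actual scrape), shown interpreter-style so you can try it yourself:
    # >>> x = 'Courses/growth.xls'
    # >>> x[::-1]                                # 'slx.htworg/sesruoC'
    # >>> x[::-1].find('.')                      # 3 -- the index of the '.' in the reversed string
    # >>> x[::-1][:x[::-1].find('.')]            # 'slx'
    # >>> x[::-1][:x[::-1].find('.')][::-1]      # 'xls'
    # An equivalent (and arguably clearer) one-liner is x.rsplit('.', 1)[-1], which splits on
    # the last period directly.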
    name = x[::-1][:x[::-1].find('/')][::-1]
    # We do a similar slice here as for extension. The assumption is that, for this page, the
    # file name begins after the last '/' (just as the extension begins after the last '.').
    print 'now working on file '+str(qq+1)+' of '+str(len(list_of_links))
    # Status updates are nice for error checking.
    if len(extension) > 5:
        # Here we check that we don't have a false positive for a file. Most extensions are 5
        # characters or fewer. This can, of course, be changed to adapt to the webpage.
        print 'Warning! Possible false positive for file match. File extension reported as'
        print extension+' and is longer than 5 characters. Likely not a file; skipping.'
        continue
        # continue means we skip the rest of this iteration of the for loop. If this statement
        # is reached (meaning the extension was longer than 5 characters), none of the code
        # below runs for this link.
    time.sleep(7)
    # This is an attempt at being a good citizen of the Internet. We don't want to cause
    # servers to crash! Hitting a server intentionally with too many requests is called a
    # Denial-of-Service (DoS) attack. The more you know!
    file_to_write_name = str(qq+1)+'.'+extension
    # This creates a string for the file name we will use.
    if x[0:4] == 'http':
        try:
            newpage = urllib2.urlopen(x.replace(' ','%20'))
        except urllib2.HTTPError, e:
            print e
            print 'File '+name+' not found. Continuing to next file.'
            continue
        # The try/except code catches possible errors. Here, if a file doesn't exist, then
        # urllib2.urlopen raises an HTTP 404 error (the same one you may have seen in a web
        # browser). This code attempts to open the link, reports the error if there is one,
        # and moves on.
    else:
        try:
            newpage = urllib2.urlopen(baseurl+x.replace(' ','%20'))
        except urllib2.HTTPError, e:
            print e
            print 'File '+name+' not found. Continuing to next file.'
            continue
    # The if-statement addresses the fact that some of the links are internal (e.g.
    # 'growth.xls') and some are external (e.g.
    # 'http://www.eco.utexas.edu/compeco/Courses/growth.xls'), but to download we need a full
    # URL either way.
    try:
        name = x[::-1][:x[::-1].find('/')][::-1]
        localFile = open(name,'wb')
        # 'wb' writes in binary mode, which matters on Windows for pdf/xls files. This will
        # download to the local Python folder. To change that, prepend a path string to name,
        # i.e. localFile = open('C:\\'+name,'wb')
        # Macintosh users will need to write to '/home/
        localFile.write(newpage.read())
        localFile.close()
        newpage.close()
    except IOError:
        print 'Warning! File '+str(qq+1)+" failed to write; possibly link is to a web page. \nTried to write to "+file_to_write_name+'.'
        # If the earlier check on extension length didn't work, this catches things that are
        # likely webpages.
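# A quick illustration of the internal-vs-external link handling above, with a hypothetical
# example value (interpreter-style; not executed as part of the scrape):
# >>> x = 'growth 2013.xls'                     # an internal link containing a space
# >>> x[0:4] == 'http'                          # False, so baseurl gets prepended
# >>> baseurl + x.replace(' ', '%20')
# 'http://laits.utexas.edu/compeco/Courses/growth%202013.xls'
# The replace(' ', '%20') is needed because URLs cannot contain literal spaces; %20 is the
# URL-encoded form of a space.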