import re
import string
import sys
import urllib
from operator import itemgetter
from xml.dom import minidom

import flickrapi
import nltk
from nltk_contrib.readability import textanalyzer

# This script generates a haiku about a particular location based on the
# titles of nearby flickr photographs.
#
# Sample command line use:
# > python geo_haiku.py "721 broadway, new york ny"
#
# Optionally, pass the number of haikus you want as a second argument:
# > python geo_haiku.py "721 broadway, new york ny" 3

# Set to true to print extra garbage.
debug = 0


# HELPER FUNCTIONS.

# Phonetic syllable counter via nltk_contrib.
# Dictionary-based approaches don't work reliably with the weird words of Flickr.
t = textanalyzer.textanalyzer('eng')


def count_syllables(word):
    """Return the syllable count the analyzer reports for `word`.

    A reported count of zero is bumped to 1 so that every word costs
    something when filling a haiku row.
    """
    count = t.countSyllables([word])
    if count == 0:
        return 1
    return count


def strip_punctuation(s):
    """Return `s` with every character in string.punctuation removed."""
    return s.translate(string.maketrans("", ""), string.punctuation)


def has_vowel(s):
    """Return a match object if `s` contains a lowercase a/e/i/o/u, else None."""
    return re.search(r'[aeiou]+', s)


def is_picture(s):
    """Return a match object if `s` ends in jpg, gif, png or jpeg, else None."""
    return re.search(r'jpg$|gif$|png$|jpeg$', s)


# GEOCODE THE ADDRESS.

# Grab a plain-text address from the command line.
if len(sys.argv) < 2:
    # Exit with a readable message instead of an IndexError traceback.
    sys.exit('usage: python geo_haiku.py "<address>" [number of haikus]')
address = sys.argv[1]

google_maps_api_key = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

# Escape the address so characters such as '&' or '#' cannot break the
# query string apart.
geo_url = ('http://maps.google.com/maps/geo?key=' + google_maps_api_key +
           '&output=xml&q=' + urllib.quote_plus(address))

geocode = urllib.urlopen(geo_url)
try:
    geocode_xml = minidom.parse(geocode)
finally:
    # Release the connection even when the response is not parseable XML.
    geocode.close()

coordinate_nodes = geocode_xml.getElementsByTagName('coordinates')
if not coordinate_nodes or not coordinate_nodes[0].childNodes:
    sys.exit('Could not geocode address: ' + address)

# The coordinates text is comma separated with longitude first.
lon_lat = coordinate_nodes[0].childNodes[0].nodeValue.split(',')
if len(lon_lat) < 2:
    sys.exit('Could not geocode address: ' + address)
latitude = lon_lat[1]
longitude = lon_lat[0]

if debug:
    print(address + ' is at lat: ' + latitude + ' lon: ' + longitude)

# FIND THE TAGS.
# Get the Flickr lib here: http://stuvel.eu/projects/flickrapi
flickr_api_key = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
flickr = flickrapi.FlickrAPI(flickr_api_key, format='etree')

tags = dict()
tags_wanted = 150
tags_found = 0
search_radius = 1
current_page = 1

# I found that I have to set a hard limit around 20 pages per search radius,
# otherwise it's too tough to find unique tags.
max_pages_per_radius = 20

# Stop widening the search eventually; without a ceiling a location with too
# few usable titles keeps this loop running forever.
# NOTE(review): 32 matches the radius limit (in km) given in Flickr's
# photos.search documentation -- confirm against the current API docs.
max_search_radius = 32

while tags_found < tags_wanted and search_radius <= max_search_radius:
    # Search Flickr for images near the address given.
    # More info on this API call:
    # http://www.flickr.com/services/api/flickr.photos.search.html
    geo_photos = flickr.photos_search(
        page=str(current_page),
        min_taken_date='1900-01-01 00:00:00',
        lat=str(latitude),
        lon=str(longitude),
        accuracy='16',
        radius=str(search_radius),
        per_page='250',
        extras='tags')

    total_pages = int(geo_photos[0].attrib['pages'])

    if debug:
        print('reading page ' + str(current_page) + ' of ' + str(total_pages))

    for photo in geo_photos.getiterator('photo'):
        # Originally planned on using tags (photo.attrib['tags']), but photo
        # titles seem to give more interesting results.
        photo_tags = photo.attrib['title'].encode('ascii', 'ignore').split(' ')

        for tag in photo_tags:
            tag = strip_punctuation(tag.lower())

            # Make sure the tag exists, don't let numbers through (they're
            # boring), require a vowel, and skip anything that looks like a
            # meta-reference to a filename.
            if tag and not re.search(r'\d+', tag) and has_vowel(tag) and not is_picture(tag):
                # Add to the dictionary, otherwise increment the frequency count.
                tags[tag] = tags.get(tag, 0) + 1

    # To get truly regional, need to subtract wider-area tags from a smaller
    # search radius? Maybe next iteration.

    # Update tags found.
    tags_found = len(tags)

    if debug:
        # NOTE(review): the message says miles, but no radius_units is sent
        # with the search -- confirm which unit Flickr applies by default.
        print('Found ' + str(tags_found) + ' unique tags in a ' + str(search_radius) + ' mile radius')

    # If we didn't get everything we wanted, then we'll have to go to the next
    # page of Flickr results, or increase the search radius for the next round.
    if (current_page >= total_pages) or (current_page >= max_pages_per_radius):
        # Reset current page.
        current_page = 1

        # Bump the search radius.
        if debug:
            print('Searching a larger area...')
        search_radius += 1
    else:
        # Search the next page.
        if debug:
            print('Going to the next page')
        current_page += 1

# Most frequent words first. The tag dictionary no longer changes, so sort
# once here instead of once per haiku row.
ranked_tags = sorted(tags.items(), key=itemgetter(1), reverse=True)

if debug:
    # Print each word found, and its frequency.
    print('{')
    for tag in ranked_tags:
        print('"' + tag[0] + '": ' + str(tag[1]) + ', ')
    print('}')
    print('\n')


# GENERATE THE HAIKU.

used_words = set()  # Make sure we only use words once.
rows = [5, 7, 5]  # Set the syllable scheme and row count.

# Allow just one boring word per line, and don't let the line end in a boring word.
boring_words = ['i', 'the', 'of', 'and', 'to', 'or', 'a', 'on', 'at', 'for', 'is', 'in', 'your', 'you', 'it', 'that', 'this']

# Figure out how many haikus to write on the topic.
haikus_wanted = 1
if len(sys.argv) >= 3:
    haikus_wanted = int(sys.argv[2])

# Syllable counts don't depend on the row being built, so count each word once.
ranked_words = [(word, count_syllables(strip_punctuation(word)))
                for word, _ in ranked_tags]

haiku_texts = []

for _ in range(haikus_wanted):
    haiku_rows = []

    # Build each row.
    for syllables_wanted in rows:
        syllables_in_row = 0
        used_boring = 0
        row_words = []

        # Go through each word, sorted by frequency.
        for word, syllables_in_word in ranked_words:
            is_boring = word in boring_words

            if is_boring and used_boring:
                # Don't get to use more than one boring word per line.
                if debug:
                    print(word + ' is boring, and we already spent our boring allowance, skip it')
                continue

            if word in used_words:
                # Keep looking if we already used the word.
                continue

            tentative_syllables_in_row = syllables_in_row + syllables_in_word

            if (tentative_syllables_in_row == syllables_wanted) and is_boring:
                # Don't end a line with a boring word.
                continue

            if tentative_syllables_in_row > syllables_wanted:
                # The word would put us past the syllable limit for the row.
                continue

            if is_boring:
                if debug:
                    print(word + ' is boring...')
                used_boring = 1  # We used our boring card for this line...

            row_words.append(word)
            syllables_in_row = tentative_syllables_in_row
            used_words.add(word)

            # Optionally print syllable count for a sanity check.
            if debug:
                print("\"" + word + "\" has " + str(syllables_in_word))

            if syllables_in_row == syllables_wanted:
                # The row is full; no remaining word can fit.
                break

        # A row that found no words is dropped so it cannot swallow the
        # blank line that separates one haiku from the next.
        if row_words:
            haiku_rows.append(' '.join(row_words))

    # Keep writing haikus!
    if haiku_rows:
        haiku_texts.append('\n'.join(haiku_rows))

haiku = '\n\n'.join(haiku_texts)
print(haiku.strip())