Relational Algebra & Metakit
by Brian Kelley



Listing One

import metakit
# create an in-memory storage
st = metakit.storage()
# create a simple base view
view = st.getas("test[first:S,last:S,age:I]")
# add some data
view.append(("Joe", "Schmoe", 32))
view.append({"first":"Moe", "last":"Schmoe", "age":26})
class Loader:
    def __init__(self, first, last, age):
        self.first = first
        self.last = last
        self.age = age
row = Loader("Zoe", "Schmoe", 22)
view.append(row)
# show the view
metakit.dump(view)

Listing Two

import metakit
# create an in-memory storage
st = metakit.storage()
# views can be created with with column names that are not valid Python 
# attribute names. Includes anything that has spaces or starts with a letter
view = st.getas("bad_attributes[bad name1:S,1fake:S]")
view.append(("a", "b"))
view.append(("c", "d"))
view.append(("e", "f"))

metakit.dump(view)

# these attributes cannot be accessed through attributes as in view.1fake
print
print "1fake column that cannot be retrieved as a python attribute"
# for columns that cannot be accessed directly, use the getattr method
for row in view:
    value = getattr(row, "1fake")
    print "1fake", value
# to get all the attributes in a view, use the view.structure() method
print
print "Column Types"
columns = view.structure()
for column in columns:
    print "%s:%s"%(column.name, column.type)

Listing Three

import metakit
st = metakit.storage()
vw = st.getas("A[key:I,name:S]")
vw2 = st.getas("B[key:I,org:S]")
vw.append((1, "Dave"))
vw.append((2, "Steve"))
vw.append((3, "Brian"))
vw2.append((1, "Royal Society"))
vw2.append((2, "American Chemical Society"))
vw2.append((4, "PNAS"))
# first form the subview where vw.key == vw2.key only use the key value
keys = vw.join(vw2, vw.key).project(vw.key)
# get all rows from vw that are not in the subview
result = vw.minus(keys.join(vw, vw.key))

print metakit.dump(result)
metakit.dump(vw.join(vw2, vw2.key))
metakit.dump(vw.join(vw2, vw2.key, outer=1))
metakit.dump(vw2.join(vw, vw.key))
metakit.dump(vw2.join(vw, vw.key, outer=1))

# how to find all entries in societies that are not used
metakit.dump(vw2.join(vw, vw.key, outer=1).minus(vw2.join(vw, vw.key)))


Listing Four

import metakit

st = metakit.storage()

view1 = st.getas("view1[a:I,c:I]")
view2 = st.getas("view2[b:I,d:I]")
view1.append((1,4))
view1.append((2,5))
view1.append((3,6))

view2.append((1,4))
view2.append((2,1))
view2.append((3,6))

print "view1"
metakit.dump(view1)
print
print "view2"
metakit.dump(view2)

# We would like to emulate the following SQL statement
# SELECT * from VIEW1,VIEW2 where view1.a=view2.b and view1.c > view2.d

# To accomplish this we need to form the combination of all rows
# between view1 and view2 and check each of them to see if
# they satisfy view1.a=view2.b and view1.c > view2.d
product = view1.product(view2)

# we can create a filter function that returns true
# when our conditions are met.
def filter(row):
    return row.a==row.b and row.c > row.d

# The filter function returns a view with the indices
# that return true for the filter function.
resultindices = product.filter(filter)

# We can generate this view through remapwith
result = product.remapwith(resultindices)

# and now we have our result!
print
print "SELECT * from VIEW1,VIEW2 where view1.a=view2.b and view1.c > view2.d"
metakit.dump(result)


Listing Five

import metakit

# create an in-memory storage
st = metakit.storage()
view = st.getas("test[item:S,count:I,location:S]")
# add some data
view.append({"item":"Pencil #2 Lead", "count":2, "location":"Cubby 1"})
view.append({"item":"Pencil #2 Lead", "count":1, "location":"Desk 3"})
view.append({"item":"Eraser", "count":1, "location":"Desk 2"})
view.append({"item":"Eraser", "count":2, "location":"Cubby 5"})
# sort the view first ascending by item and descending by count to break ties,
# i.e. we want to have the items in order, but show the highest counts first.

sortedView = view.sortrev([view.item,view.count], [view.count])
metakit.dump(sortedView)

Listing Six

"""This example shows various uses of metakit joins"""

import metakit
st = metakit.storage()

# simple join between two tables
# select a,b,c from vw1,vw2 where vw1.a=vw2.a

vw = st.getas("test1[a:I,b:S]")
vw.append((1, "view1"))
vw2 = st.getas("test2[a:I,c:S]")
vw2.append((1, "view2"))
vw2.append((2, "view2"))
vw2.append((2, "view1"))

print "vw"
metakit.dump(vw)
print
print "vw2"
metakit.dump(vw2)

# perform a simple join on the a column
print
print "select * from vw1, vw2 where vw1.a=vw2.a"
result = vw.join(vw2, vw.a)
metakit.dump(result)

# result should be:
#a  b      c    
# -  -----  -----
# 1  view1  view2
# -  -----  -----
# Total: 1 rows
# more complicated join, essentially select vw1.a, vw1.b from vw1,vw2 
# where vw1.b=vw2.c in this case we need to rename vw2's c column
#  to "b" so we can use the join
print
print "select vw1.a, vw1.b from vw1,vw2 where vw1.b=vw2.c"
result = vw.join(vw2.rename("c", "b"), vw.b)
metakit.dump(result)

# result should be
#a  b    
# -  -----
# 1  view1
# -  -----
# Total: 1 rows

print
print "select * from vw right outer join vw2 where vw.a=vw2.a"
print " this keeps all data from all rows in vw2"
print " note the last two rows have no corresponding match in vw"
print " but they are kept anyway"
result = vw2.join(vw, vw2.a, outer=1)
metakit.dump(result)


Listing Seven

import metakit
st = metakit.storage()
cards = st.getas("card[card_id:I,team,player]")
people = st.getas("person[name,card_id:I]")
cards.append((1, "Red Sox", "Pedro Martinez"))
cards.append((2, "Red Sox", "Trot Nixon"))
# Steve is a fanatic, he has multiple cards
people.append(("Steve", 1))
people.append(("Steve", 2))
people.append(("Steve", 1))
people.append(("Brian", 2))
cardholders = people.groupby(people.name, "cards")
for person in cardholders:
    if len(cards.project(cards.card_id).minus(person.cards)) == 0:
        print person.name

Listing Eight

import metakit
import time, random

# create an in-memory storage
st = metakit.storage()

# We have a bunch of time series data that we want to analyze.  Each datapoint has a time,
# and a value and is associated with a given trajectory id.

# the standard way to define this table is as follows:

vw = st.getas("trajectory[trajectory_id:I,time:S,value:F]")
for i in range(10000):
    vw.append({"trajectory_id":1, "time":time.asctime(), "value":random.random()})

# and then to get to a particular trajectory we would perform
# a selection

t1 = time.time()
trajectory = vw.select(trajectory_id=1)
t2 = time.time()
print "Retrieved trajectory 1 in %0.2f seconds"%(t2-t1)

# however, with subviews, we can simply define the following table
vw = st.getas("trajectory2[trajectory_id:I,data[time:S,value:F]]")

# now to append data, we first append the trajectory

vw.append({"trajectory_id":1})

# retrieve the row with the new trajectory id
# this both saves data in the table and also retrieves the trajectories
# much faster than the previous select.
row = vw.select(trajectory_id=1)[0]

# and append the data

for i in range(10000):
    row.data.append({"time":time.asctime(), "value":random.random()})

t1 = time.time()
row = vw.select(trajectory_id=1)[0]
t2 = time.time()
print "Retrieved trajectory 1 in %0.2f seconds"%(t2-t1)


Listing Nine

import metakit

# we are forming two tables, A and B. A holds the KEY and the people 
# associated with KEY. B holds the KEY and the organization associated with 
# KEY. We want to find all the people in A who do not have an entry in B, i.e.
# select * from A where A.KEY not in (select KEY from B)

st = metakit.storage()
A = st.getas("A[key:I,name:S]")
B = st.getas("B[key:I,org:S]")

A.append((1, "Dave"))
A.append((2, "Steve"))
A.append((3, "Brian"))

B.append((1, "Royal Society"))
B.append((2, "American Chemical Society"))
B.append((4, "PNAS"))

akeyNotInB = A.project(A.KEY).minus(B.project(B.KEY))
metakit.dump(akeyNotInB)

# now join keys back to A so we can get the whole rows back. These relational 
# operators add a lot more flexibility than the SQL statements above.
result = A.join(akeyNotInB, A.KEY)
metakit.dump(result)






4


