Why is Go slower than Python for this parallel math code?

I was recently messing around with one of my old Project Euler solutions (specifically, problem 215) to test PyPy's new software transactional memory feature, and decided to port it to Go to see how the code compared.

The first question I had was how to do generators in Go. Python has the magic yield statement, so you can do things like this:

def gen_ways(width, blocks):
    """Generate all possible permutations of items in blocks that add up
    to width, as strings."""
    if width == 0:
        yield ""
    else:
        for block in blocks:
            if block <= width:
                for way in gen_ways(width - block, blocks):
                    yield str(block) + way

The Go equivalent is a function that returns an output-only unbuffered channel, which returns the output from a goroutine. It's not quite as terse, but then it didn't require adding extra support to the language, either.

/* Generate all possible permutations of items in blocks that add up to width,
as strings. */
func gen_ways(width int, blocks []int) chan string {
    out := make(chan string)
    go func() {
        if width == 0 {
            out <- ""
        } else {
            for _, block := range blocks {
                if block <= width {
                    for way := range gen_ways(width-block, blocks) {
                        out <- strconv.Itoa(block) + way
                    }
                }
            }
        }
        close(out)
    }()
    return out
}

Longer, but it's very nice to be able to do this without an additional keyword. One gripe: the channel is logically output-only, but the code doesn't work unless I make it bidirectional. (Because the function is recursive?)

The next function to port looked like this:

def build_mirrors(ways):
    mirrored = set()
    mirrors = set()
    for way in ways:
        rev = "".join(reversed(way))
        if way != rev:
            low, high = sorted([way, rev])
            mirrored.add(low)
            mirrors.add(high)
    return mirrored, mirrors

Go lacks a builtin set. I considered just building my own with a map, but instead pulled in github.com/deckarep/golang-set. Which worked okay. One annoyance was that I couldn't just declare a set and have a useful set; I had to call mapset.NewSet(). Can't really fault the author of the set module for that problem, though, since Go's builtin maps and arrays and chans have the same problem. Zero values that give runtime errors when you try to use them aren't very useful. So much for static typing helping find errors at compile time. Anyway, I ended up with this:

func build_mirrors(ways []string) (mirrored, mirrors mapset.Set) {
    mirrored = mapset.NewSet()
    mirrors = mapset.NewSet()
    for _, way := range ways {
        rev := reverse(way)
        if way < rev {
            mirrored.Add(way)
            mirrors.Add(rev)
        } else if rev < way {
            mirrored.Add(rev)
            mirrors.Add(way)
        }
    }
    return
}

The next concept I had to translate was a group of processes or threads running in parallel, reading from a common input queue and writing to a common results queue. Here's the Python function:

def count_combos(in_queue, out_queue, compatible_ways):
    """Read tuples of (height, prev_way) from in_queue, call gen_combos
    on each, count the number of results, and put the count on out_queue."""
    while True:
        (height, prev_way) = in_queue.get()
        if height is None:
            return
        count = 0
        for combo in gen_combos(height, prev_way, compatible_ways):
            count += 1
        out_queue.put((height, prev_way, count))

and the blob of code that calls it:

in_queue = multiprocessing.Queue()
out_queue = multiprocessing.Queue()
cpus = multiprocessing.cpu_count()

half = (height - 1) // 2
for way in ways:
    if way not in mirrors:
        in_queue.put((half, way))
# sentinels
for unused in xrange(cpus):
    in_queue.put((None, None))
procs = []
for unused in xrange(cpus):
    proc = multiprocessing.Process(target=count_combos,
                                   args=(in_queue,
                                         out_queue,
                                         compatible_ways))
    proc.daemon = True
    proc.start()
    procs.append(proc)

I think the called code is fine, but the calling code is kind of ugly, because of the way I need to manually track the process objects to clean them up later. Also, it might have been better with more explicit sentinels than just lazily using None.

The Go version of the called code is similar, except that I had to setup some structs rather than just passing tuples around.

type (
    height_way struct {
        height int
        way    string
    }
    height_way_count struct {
        height int
        way    string
        count  uint64
    }
)

const SENTINEL_HEIGHT = -1

/* Read height_way structs from in_queue, call gen_combos
   on each, count the number of results, and put height_way_count
   structs on out_queue. */
func count_combos(in_queue <-chan height_way,
    out_queue chan<- height_way_count,
    compatible_ways map[string][]string) {
    for {
        hw := <-in_queue
        height := hw.height
        way := hw.way
        if height == SENTINEL_HEIGHT {
            return
        }
        var count uint64
        for _ = range gen_combos(height, way, compatible_ways) {
            count += 1
        }
        out_queue <- height_way_count{height, way, count}
    }
}

The Go version of the calling code is nicer, because goroutines are builtin syntax so you don't need to do manual process management.

    const QUEUE_SIZE = 9999

    in_queue := make(chan height_way, QUEUE_SIZE)
    out_queue := make(chan height_way_count, QUEUE_SIZE)

    half := (height - 1) / 2
    for _, way := range ways {
        if !mirrors.Contains(way) {
            in_queue <- height_way{half, way}
        }
    }
    for ii := 0; ii < maxprocs; ii++ {
        in_queue <- height_way{SENTINEL_HEIGHT, ""}
    }
    for ii := 0; ii < maxprocs; ii++ {
        go count_combos(in_queue, out_queue, compatible_ways)
    }

It was surprisingly tricky to make this work without deadlocking. First I forgot to set the channel size, which meant I had blocking channels, which immediately blocked when I put data on them, since the goroutine to receive the data wasn't running yet. Unbuffered and buffered channels are fundamentally different beasts.

Finally, when I got everything working, the Go program only ran on a single CPU core. I remembered seeing GOMAXPROCS in the docs, and doing "GOMAXPROCS=4 ./euler215" worked. But when I tried setting it from inside the program, like "runtime.GOMAXPROCS = 4", the variable changed but the program never actually used multiple cores. So runtime.GOMAXPROCS is an attractive nuisance; it's documented, and looks like it should work, but (at least in Go 1.2.1 on Ubuntu) doesn't really. (This may be issue 1492 , but that bug is listed as fixed.)

Anyway, once the program gave the correct answer on the trivial problem size (width 9, height 3), it was time to run it with the full problem size (width 32, height 10). On my (slow) 3 GHz Phenom II quad-core, CPython 2.7 takes 21:13 and PyPy 2.3.1 takes 7:06, so I was figuring the Go version should finish in a couple of minutes.

Nope. Half an hour later, I was wondering what was wrong. An hour later, I was still wondering. The program did finish eventually, and gave the right answer (once I changed a few counters from int to uint64 to keep them from rolling over), but it took 81 minutes. Almost 4 times as slow as CPython, and almost 12 times as slow as PyPy.

So I instrumented the Go version for profiling, following the directions from the Go blog. After my program was profiled, I ran it with "time GOMAXPROCS=4 ./euler215 -cpuprofile prof.out -width 25 -height 10" so it would run fairly quickly, then ran "go tool pprof ./euler215 prof.out" to enter the interactive profiler, then "top25" to see:

(pprof) top25
Total: 2883 samples
     226   7.8%   7.8%      226   7.8% scanblock
     221   7.7%  15.5%      221   7.7% etext
     218   7.6%  23.1%      739  25.6% runtime.mallocgc
     195   6.8%  29.8%      195   6.8% runtime.xchg
     185   6.4%  36.2%      191   6.6% runtime.settype_flush
     181   6.3%  42.5%     1631  56.6% main.func·002
     173   6.0%  48.5%      173   6.0% sweepspan
     146   5.1%  53.6%      146   5.1% runtime.casp
     112   3.9%  57.5%      208   7.2% runtime.markallocated
     108   3.7%  61.2%      770  26.7% cnew
      98   3.4%  64.6%       98   3.4% markonly
      95   3.3%  67.9%      692  24.0% runtime.growslice
      92   3.2%  71.1%       92   3.2% runtime.memclr
      70   2.4%  73.5%      330  11.4% runtime.makeslice
      53   1.8%  75.4%      101   3.5% runtime.unlock
      51   1.8%  77.1%      166   5.8% runtime.chansend
      45   1.6%  78.7%      122   4.2% runtime.lock
      43   1.5%  80.2%      159   5.5% runtime.chanrecv
      35   1.2%  81.4%      597  20.7% growslice1
      31   1.1%  82.5%       31   1.1% runtime.memmove
      29   1.0%  83.5%       29   1.0% schedule
      25   0.9%  84.4%       25   0.9% park0
      24   0.8%  85.2%       24   0.8% flushptrbuf
      23   0.8%  86.0%       43   1.5% runtime.mapaccess1_faststr
      17   0.6%  86.6%       17   0.6% dequeue

Meh. It's spending a lot of time making slices. Growing slices. Clearing memory. Allocating memory. Sending and receiving on channels. Basically, all down inside the Go runtime, not in my code. Not a whole lot of useful information on what to fix. Maybe I'm doing something incorrect with maps that's causing excessive memory churn. Or maybe Go maps are just too slow.

Anyway, I'll dump the full code here, in case anyone wants to see the full example. Here's the Python program:

#!/usr/bin/env python

"""Project Euler, problem 215

Consider the problem of building a wall out of 2x1 and 3x1 bricks
(horizontal vertical dimensions) such that, for extra strength, the gaps
between horizontally adjacent bricks never line up in consecutive layers, i.e.
never form a "running crack".

For example, the following 9x3 wall is not acceptable due to the running
crack shown in red:

3222
2232
333

There are eight ways of forming a crack-free 9x3 wall, written W(9,3) = 8.

Calculate W(32,10).
"""

import sys
import multiprocessing

try:
    import psyco
    psyco.full()
except ImportError:
    pass


sys.setrecursionlimit(100)


def gen_ways(width, blocks):
    """Generate all possible permutations of items in blocks that add up
    to width, as strings."""
    if width == 0:
        yield ""
    else:
        for block in blocks:
            if block <= width:
                for way in gen_ways(width - block, blocks):
                    yield str(block) + way


def build_mirrors(ways):
    mirrored = set()
    mirrors = set()
    for way in ways:
        rev = "".join(reversed(way))
        if way != rev:
            low, high = sorted([way, rev])
            mirrored.add(low)
            mirrors.add(high)
    return mirrored, mirrors


def find_cracks(way):
    """Return the set of indexes where cracks occur in way"""
    result = set()
    total = 0
    for ch in way[:-1]:
        total += int(ch)
        result.add(total)
    return result


def crack_free(tup1, tup2, cracks):
    """Return True iff tup1 and tup2 can be adjacent without making a
    crack."""
    return not cracks[tup1].intersection(cracks[tup2])


def find_compatible_ways(way, ways, cracks):
    """Return a list of crack-free adjacent ways for way"""
    result = []
    for way2 in ways:
        if crack_free(way, way2, cracks):
            result.append(way2)
    return result


def build_compatible_ways(ways):
    cracks = {}
    for way in ways:
        cracks[way] = find_cracks(way)
    print "done generating %d cracks" % len(cracks)
    compatible_ways = {}
    compatible_ways[()] = ways
    for way in ways:
        compatible_ways[way] = find_compatible_ways(way, ways, cracks)
    return compatible_ways


def gen_combos(height, prev_way, compatible_ways):
    """Generate all ways to make a crack-free wall of size (width, height),
    as height-lists of width-strings."""
    if height == 0:
        return
    elif height == 1:
        for way in compatible_ways[prev_way]:
            yield [way]
    else:
        for way in compatible_ways[prev_way]:
            for combo in gen_combos(height - 1, way, compatible_ways):
                yield [way] + combo


def count_combos(in_queue, out_queue, compatible_ways):
    """Read tuples of (height, prev_way) from in_queue, call gen_combos
    on each, count the number of results, and put the count on out_queue."""
    while True:
        (height, prev_way) = in_queue.get()
        if height is None:
            return
        count = 0
        for combo in gen_combos(height, prev_way, compatible_ways):
            count += 1
        out_queue.put((height, prev_way, count))


def count_combos_memo(in_queue, out_queue, compatible_ways, memo):
    """Read tuples of (height, prev_way) from in_queue, call gen_combos
    on each, chain the result of memo to the last result in the combo
    to get the total count, and put the count on out_queue."""
    while True:
        (height, prev_way) = in_queue.get()
        if height is None:
            return
        count = 0
        for combo in gen_combos(height, prev_way, compatible_ways):
            last = combo[-1]
            count += memo[last]
        out_queue.put((height, prev_way, count))


def W(width, height):
    """Return the number of ways to make a crack-free wall of size
    (width, height)."""
    ways = sorted(gen_ways(width, [2, 3]))
    print "done generating %d ways" % len(ways)

    mirrored, mirrors = build_mirrors(ways)
    print "have %d mirror images " % (len(mirrored))

    compatible_ways = build_compatible_ways(ways)
    print "done generating %d compatible_ways" % sum(map(
        len, compatible_ways.itervalues()))

    in_queue = multiprocessing.Queue()
    out_queue = multiprocessing.Queue()

    cpus = multiprocessing.cpu_count()

    half = (height - 1) // 2
    for way in ways:
        if way not in mirrors:
            in_queue.put((half, way))
    # sentinels
    for unused in xrange(cpus):
        in_queue.put((None, None))
    procs = []
    for unused in xrange(cpus):
        proc = multiprocessing.Process(target=count_combos,
                                       args=(in_queue,
                                             out_queue,
                                             compatible_ways))
        proc.daemon = True
        proc.start()
        procs.append(proc)

    half_memo = {}

    num_ways = len(ways) - len(mirrors)
    for ii in xrange(num_ways):
        (unused, prev_way, count) = out_queue.get()
        half_memo[prev_way] = count
        if prev_way in mirrored:
            half_memo["".join(reversed(prev_way))] = count
        print "(%d/%d) %s mirrored=%d count=%d" % (
            ii + 1, num_ways, prev_way, prev_way in mirrored, count)

    for proc in procs:
        proc.join()

    rest = (height - 1) - half

    for way in ways:
        if way not in mirrors:
            in_queue.put((rest, way))
    # sentinels
    for unused in xrange(cpus):
        in_queue.put((None, None))
    procs = []
    for unused in xrange(cpus):
        proc = multiprocessing.Process(target=count_combos_memo, args=(
                                       in_queue, out_queue, compatible_ways,
                                       half_memo))
        proc.daemon = True
        proc.start()
        procs.append(proc)

    total = 0
    for ii in xrange(num_ways):
        (unused, prev_way, count) = out_queue.get()
        if prev_way in mirrored:
            count *= 2
        total += count
        print "(%d/%d) %s mirrored=%d count=%d total=%d" % (
            ii + 1, num_ways, prev_way, prev_way in mirrored, count, total)
    for proc in procs:
        proc.join()
    return total


def main():
    try:
        width = int(sys.argv[1])
    except IndexError:
        width = 32
    try:
        height = int(sys.argv[2])
    except IndexError:
        height = 10
    print W(width, height)


if __name__ == "__main__":
    main()

And here's the Go version:

/* Project Euler, problem 215

Consider the problem of building a wall out of 2x1 and 3x1 bricks
(horizontal vertical dimensions) such that, for extra strength, the gaps
between horizontally adjacent bricks never line up in consecutive layers, i.e.
never form a "running crack".

For example, the following 9x3 wall is not acceptable due to the running
crack shown in red:

3222
2232
333

There are eight ways of forming a crack-free 9x3 wall, written W(9,3) = 8.

Calculate W(32,10).
*/

package main

import (
    "fmt"
    "github.com/deckarep/golang-set"
    "os"
    "runtime"
    "strconv"
    "flag"
    "log"
    "runtime/pprof"
)

type (
    height_way struct {
        height int
        way    string
    }
    height_way_count struct {
        height int
        way    string
        count  uint64
    }
)

const SENTINEL_HEIGHT = -1
const QUEUE_SIZE = 9999

/* Generate all possible permutations of items in blocks that add up to width,
as strings. */
func gen_ways(width int, blocks []int) chan string {
    out := make(chan string)
    go func() {
        if width == 0 {
            out <- ""
        } else {
            for _, block := range blocks {
                if block <= width {
                    for way := range gen_ways(width-block, blocks) {
                        out <- strconv.Itoa(block) + way
                    }
                }
            }
        }
        close(out)
    }()
    return out
}

func reverse(str string) (result string) {
    for _, ch := range str {
        result = string(ch) + result
    }
    return
}

func build_mirrors(ways []string) (mirrored, mirrors mapset.Set) {
    mirrored = mapset.NewSet()
    mirrors = mapset.NewSet()
    for _, way := range ways {
        rev := reverse(way)
        if way < rev {
            mirrored.Add(way)
            mirrors.Add(rev)
        } else if rev < way {
            mirrored.Add(rev)
            mirrors.Add(way)
        }
    }
    return
}

/* Return the set of indexes where cracks occur in way */
func find_cracks(way string) (result mapset.Set) {
    result = mapset.NewSet()
    total := 0
    for ii := 0; ii < len(way)-1; ii++ {
        str1 := way[ii : ii+1]
        num, _ := strconv.Atoi(str1)
        total += num
        result.Add(total)
    }
    return result
}

/* Return True iff tup1 and tup2 can be adjacent without making a crack. */
func crack_free(tup1 string, tup2 string, cracks map[string]mapset.Set) bool {
    return cracks[tup1].Intersect(cracks[tup2]).Cardinality() == 0
}

/* Return a list of crack-free adjacent ways for way */
func find_compatible_ways(way string,
    ways []string,
    cracks map[string]mapset.Set) (result []string) {
    for _, way2 := range ways {
        if crack_free(way, way2, cracks) {
            result = append(result, way2)
        }
    }
    return result
}

func build_compatible_ways(ways []string) map[string][]string {
    cracks := make(map[string]mapset.Set)
    for _, way := range ways {
        cracks[way] = find_cracks(way)
    }
    fmt.Printf("done generating %d cracks\n", len(cracks))
    compatible_ways := make(map[string][]string)
    compatible_ways[""] = ways
    for _, way := range ways {
        compatible_ways[way] = find_compatible_ways(way, ways, cracks)
    }
    return compatible_ways
}

/* Generate all ways to make a crack-free wall of size (width, height),
   as height-arrays of width-strings. */
func gen_combos(height int,
    prev_way string,
    compatible_ways map[string][]string) chan []string {
    out := make(chan []string)
    go func() {
        if height == 0 {
            return
        } else if height == 1 {
            for _, way := range compatible_ways[prev_way] {
                wayarray := []string{way}
                out <- wayarray
            }
        } else {
            for _, way := range compatible_ways[prev_way] {
                for combo := range gen_combos(height-1, way, compatible_ways) {
                    wayarray := make([]string, height)
                    wayarray = append(wayarray, way)
                    for _, x := range combo {
                        wayarray = append(wayarray, x)
                    }
                    out <- wayarray
                }
            }
        }
        close(out)
    }()
    return out
}

/* Read height_way structs from in_queue, call gen_combos
   on each, count the number of results, and put height_way_count
   structs on out_queue. */
func count_combos(in_queue <-chan height_way,
    out_queue chan<- height_way_count,
    compatible_ways map[string][]string) {
    for {
        hw := <-in_queue
        height := hw.height
        way := hw.way
        if height == SENTINEL_HEIGHT {
            return
        }
        var count uint64
        for _ = range gen_combos(height, way, compatible_ways) {
            count += 1
        }
        out_queue <- height_way_count{height, way, count}
    }
}

/* Read tuples of (height, prev_way) from in_queue, call gen_combos
   on each, chain the result of memo to the last result in the combo
   to get the total count, and put the count on out_queue. */
func count_combos_memo(in_queue <-chan height_way,
    out_queue chan<- height_way_count,
    compatible_ways map[string][]string,
    memo map[string]uint64) {
    for {
        hw := <-in_queue
        height := hw.height
        way := hw.way
        if height == SENTINEL_HEIGHT {
            return
        }
        var count uint64
        for combo := range gen_combos(height, way, compatible_ways) {
            last := combo[len(combo)-1]
            count += memo[last]
        }
        out_queue <- height_way_count{height, way, count}
    }
}

/* Return the number of ways to make a crack-free wall of size
   (width, height). */
func W(width, height int) uint64 {
    var ways []string
    for way := range gen_ways(width, []int{2, 3}) {
        ways = append(ways, way)
    }
    fmt.Printf("done generating %d ways\n", len(ways))

    mirrored, mirrors := build_mirrors(ways)
    fmt.Printf("have %d mirror images\n", mirrored.Cardinality())

    compatible_ways := build_compatible_ways(ways)
    var total uint64
    for _, way := range compatible_ways {
        total += uint64(len(way))
    }
    fmt.Printf("done generating %d compatible_ways\n", total)

    in_queue := make(chan height_way, QUEUE_SIZE)
    out_queue := make(chan height_way_count, QUEUE_SIZE)

    half := (height - 1) / 2
    for _, way := range ways {
        if !mirrors.Contains(way) {
            in_queue <- height_way{half, way}
        }
    }
    maxprocs := runtime.NumCPU()
    // XXX This doesn't seem to work reliably in Go 1.2.1; set GOMAXPROCS
    // environment variable instead of doing it after the program starts.
    // https://code.google.com/p/go/issues/detail?id=1492 ?
    runtime.GOMAXPROCS(maxprocs)
    for ii := 0; ii < maxprocs; ii++ {
        in_queue <- height_way{SENTINEL_HEIGHT, ""}
    }
    for ii := 0; ii < maxprocs; ii++ {
        go count_combos(in_queue, out_queue, compatible_ways)
    }

    half_memo := make(map[string]uint64)

    num_ways := len(ways) - mirrors.Cardinality()
    fmt.Println("num_ways", num_ways)
    for ii := 0; ii < num_ways; ii++ {
        hwc := <-out_queue
        prev_way := hwc.way
        count := hwc.count
        half_memo[prev_way] = count
        if mirrored.Contains(prev_way) {
            half_memo[reverse(prev_way)] = count
        }
        fmt.Printf("(%d/%d) %s mirrored=%v count=%d\n", ii+1, num_ways,
            prev_way, mirrored.Contains(prev_way), count)
    }

    rest := (height - 1) - half

    in_queue2 := make(chan height_way, QUEUE_SIZE)
    out_queue2 := make(chan height_way_count, QUEUE_SIZE)

    for _, way := range ways {
        if !mirrors.Contains(way) {
            in_queue2 <- height_way{rest, way}
        }
    }
    for ii := 0; ii < maxprocs; ii++ {
        in_queue2 <- height_way{SENTINEL_HEIGHT, ""}
    }
    for ii := 0; ii < maxprocs; ii++ {
        go count_combos_memo(in_queue2, out_queue2, compatible_ways, half_memo)
    }

    var total2 uint64
    for ii := 0; ii < num_ways; ii++ {
        hwc := <-out_queue2
        prev_way := hwc.way
        count := hwc.count
        if mirrored.Contains(prev_way) {
            count *= 2
        }
        total2 += uint64(count)
        fmt.Printf("(%d/%d) %s mirrored=%v count=%d total2=%d\n", ii+1,
            num_ways, prev_way, mirrored.Contains(prev_way), count, total2)
    }
    return total2
}

func main() {
    var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
    var width = flag.Int("width", 32, "width in blocks")
    var height = flag.Int("height", 10, "height in blocks")
    flag.Parse()

    if *cpuprofile != "" {
        f, err := os.Create(*cpuprofile)
        if err != nil {
            log.Fatal(err)
        }
        pprof.StartCPUProfile(f)
        defer pprof.StopCPUProfile()
    }

    fmt.Println(W(*width, *height))
}