Jump to content

Go Programming Language Cookbook/Bit map

From Wikibooks, open books for an open world

Given a big file of IPv4 addresses (more than 100 GB) — we need to count unique addresses. If we use generic map[string]bool — we will need more than 64 GB of RAM, so lets use the bit map:

package main

import (
	"bufio"
	"fmt"
	"math/bits"
	"os"
)

// bitsetSize is the number of bytes needed for 2^32 bits (512 MB)
const bitsetSize = 1 << 29

func main() {
	// Open the input file
	file, err := os.Open("ip_addresses")
	if err != nil {
		fmt.Println("Error opening file:", err)
		return
	}
	defer file.Close()

	// Allocate the bitset
	bitset := [bitsetSize]byte{}

	// Use a buffered scanner with a larger buffer
	scanner := bufio.NewScanner(file)
	const maxBuffer = 64 * 1024 // 64 KB buffer
	buf := make([]byte, 0, maxBuffer)
	scanner.Buffer(buf, maxBuffer)

	// Process each line
	for scanner.Scan() {
		line := scanner.Bytes() // Use bytes directly, avoid string conversion

		// Parse the IP address manually from bytes
		ip := parseIPv4(line)
		// Set the bit
		byteIndex := ip >> 3 // Divide by 8
		bitIndex := ip & 7   // Bit position 0-7
		bitset[byteIndex] |= 1 << bitIndex
	}

	// Check for scanning errors
	if err := scanner.Err(); err != nil {
		fmt.Println("Error reading file:", err)
		return
	}

	// Count set bits
	var count uint64
	for i := 0; i < bitsetSize; i++ {
		count += uint64(bits.OnesCount8(bitset[i]))
	}

	fmt.Println("Number of unique IPv4 addresses:", count)
}

func parseIPv4(line []byte) uint32 {
	var ip uint32
	pos := 0

	// Octet 1
	n := uint32(line[pos] - '0')
	pos++
	for pos < len(line) && line[pos] != '.' {
		n = n*10 + uint32(line[pos]-'0')
		pos++
	}
	ip |= n << 24
	pos++ // Skip the dot

	// Octet 2
	n = uint32(line[pos] - '0')
	pos++
	for pos < len(line) && line[pos] != '.' {
		n = n*10 + uint32(line[pos]-'0')
		pos++
	}
	ip |= n << 16
	pos++ // Skip the dot

	// Octet 3
	n = uint32(line[pos] - '0')
	pos++
	for pos < len(line) && line[pos] != '.' {
		n = n*10 + uint32(line[pos]-'0')
		pos++
	}
	ip |= n << 8
	pos++ // Skip the dot

	// Octet 4
	n = uint32(line[pos] - '0')
	pos++
	for pos < len(line) {
		n = n*10 + uint32(line[pos]-'0')
		pos++
	}
	ip |= n

	return ip
}