Go Programming Language Cookbook/Bit map
Appearance
Given a big file of IPv4 addresses (more than 100 GB) — we need to count unique addresses. If we use generic map[string]bool
— we will need more than 64 GB of RAM, so lets use the bit map:
package main
import (
"bufio"
"fmt"
"math/bits"
"os"
)
// bitsetSize is the number of bytes needed for 2^32 bits (512 MB)
const bitsetSize = 1 << 29
func main() {
// Open the input file
file, err := os.Open("ip_addresses")
if err != nil {
fmt.Println("Error opening file:", err)
return
}
defer file.Close()
// Allocate the bitset
bitset := [bitsetSize]byte{}
// Use a buffered scanner with a larger buffer
scanner := bufio.NewScanner(file)
const maxBuffer = 64 * 1024 // 64 KB buffer
buf := make([]byte, 0, maxBuffer)
scanner.Buffer(buf, maxBuffer)
// Process each line
for scanner.Scan() {
line := scanner.Bytes() // Use bytes directly, avoid string conversion
// Parse the IP address manually from bytes
ip := parseIPv4(line)
// Set the bit
byteIndex := ip >> 3 // Divide by 8
bitIndex := ip & 7 // Bit position 0-7
bitset[byteIndex] |= 1 << bitIndex
}
// Check for scanning errors
if err := scanner.Err(); err != nil {
fmt.Println("Error reading file:", err)
return
}
// Count set bits
var count uint64
for i := 0; i < bitsetSize; i++ {
count += uint64(bits.OnesCount8(bitset[i]))
}
fmt.Println("Number of unique IPv4 addresses:", count)
}
func parseIPv4(line []byte) uint32 {
var ip uint32
pos := 0
// Octet 1
n := uint32(line[pos] - '0')
pos++
for pos < len(line) && line[pos] != '.' {
n = n*10 + uint32(line[pos]-'0')
pos++
}
ip |= n << 24
pos++ // Skip the dot
// Octet 2
n = uint32(line[pos] - '0')
pos++
for pos < len(line) && line[pos] != '.' {
n = n*10 + uint32(line[pos]-'0')
pos++
}
ip |= n << 16
pos++ // Skip the dot
// Octet 3
n = uint32(line[pos] - '0')
pos++
for pos < len(line) && line[pos] != '.' {
n = n*10 + uint32(line[pos]-'0')
pos++
}
ip |= n << 8
pos++ // Skip the dot
// Octet 4
n = uint32(line[pos] - '0')
pos++
for pos < len(line) {
n = n*10 + uint32(line[pos]-'0')
pos++
}
ip |= n
return ip
}