User:Kellen/Scripts

Scripts I've written for manipulating Wikibooks data. Be sure to copy from the edit box, not the rendered page, as some HTML entities are used.
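For example, a sed pattern that has to match an ampersand in fetched HTML must use the &amp; entity, since MediaWiki escapes ampersands in the hrefs it emits; the rendered page collapses the entity to a bare &, which silently breaks the pattern. A minimal illustration:

# as stored in the edit box: matches the escaped & in raw HTML
sed 's/&amp;action=edit//g'
# as copied from the rendered page: never matches lynx/wget output
sed 's/&action=edit//g'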

No guarantee of efficiency, correctness, or beauty. A guarantee of hacky sed regular expressions.

Isolate cookbook wantedpages:

#!/bin/bash

# fetch the first 5000 entries of Special:Wantedpages and reduce the
# raw HTML to a wiki-formatted list of missing Cookbook pages
lynx -source -dump "http://en.wikibooks.org/w/index.php?title=Special:Wantedpages&limit=5000&offset=0" |
        # keep only red-linked Cookbook entries, skipping talk pages
        # and struck-out items
        grep Cookbook |
        grep edit |
        grep -v "Talk:Cookbook" |
        grep -v "Cookbook_talk:" |
        grep -v "<s>" |
        # strip <li> and </li> tags and turn underscores into spaces
        sed 's/<[/li]\{2,3\}>//g' |
        sed 's/_/ /g' |
        # put the trailing "(N links)" anchor on its own line
        sed 's/(<a/\n(<a/g' |
        # drop the edit action from hrefs (raw HTML escapes & as &amp;)
        sed 's/&amp;action=edit//g' |
        # rewrite the page anchor as a [[Cookbook:Title|Title]] wikilink
        sed 's/^<a href="\/w\/index.php?title=\(Cookbook:\([^"]*\)\)".*$/[[\1|\2]]/g' |
        # rewrite the Whatlinkshere anchor as a wikilink as well
        sed 's/<a href="\/w\/index.php?title=\([^"]*\)"[^>]*>\([^<]*\)[^)]*/[[\1|\2]]/g' |
        sed 's/&amp;target=/\//g' |
        # rejoin each title/links pair onto one line (GNU sed 1~2N)
        # and prefix it with * to make a wiki list item
        sed '1~2N;s/\n/ /g' |
        sed 's/^/*/g'
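The output is ready to paste into a wiki page; each line should look roughly like this (page name and link count are hypothetical):

*[[Cookbook:Foo|Foo]] ([[Special:Whatlinkshere/Cookbook:Foo|5 links]])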

Get redirects from cookbook allpages:

#!/bin/bash

# filename prefix
FILE=cookbookredirs
# servername
SERVER="http://en.wikibooks.org"
# start page
PAGE="/w/index.php?title=Special%3AAllpages&from=&namespace=102"

# fetch all pages
num=0
while [ -n "$PAGE" ]
do
  num=$(($num+1))
  echo "Getting page number ${num}, ${PAGE}"
  wget -q -O "${FILE}.${num}" "${SERVER}${PAGE}"

  # get the next page url, unescaping &amp; so wget gets a real URL
  PAGE=`grep -o '<a href="[^"]*" title="[^"]*">Next page ([^)]*)' "${FILE}.${num}" | grep -o 'href="[^"]*"' | grep -o '"[^"]*"' | sed 's/"//g' | sed 's/&amp;/\&/g'`
  #if [ -n "$PAGE" ]; then
  #  echo "Next page is ${PAGE}"
  #fi
done

echo "Got ${num} files."

# strip each file down to only redirects
i=0
while [ $i -lt "$num" ]
do
  i=$(($i+1))
  FN="${FILE}.${i}"
  if [ ! -f $FN ]; then
    echo "Can't find ${FN}"
    break
  fi

  # add a marker to beginning of page list
  sed -i 's/<table style="background: inherit;" border="0" width="100%">/\nBREAKHERE\n/' $FN
  # kill everything above page list marker
  sed -i '0,/BREAKHERE/d' $FN
  # find end of page list and kill everything after
  sed -i 's/<\/table>/\n/1' $FN
  sed -i '2,$d' $FN
  # add a linebreak after each item, replacing /td
  sed -i 's|</td>|\n|g' $FN
  # remove all remaining <tr> and <td> opening and closing tags
  sed -i 's|<[trd/]\{2,3\}>||g' $FN
  # strip down to just title
  sed -i 's/<a href="\/wiki\/\([^"]*\)" title="\([^"]*\)">.*$/\1\n/g' $FN
  # only get redirects
  sed -i -n '/allpagesredirect/p' $FN
  sed -i 's/<div class="allpagesredirect">\(.*\)$/* [[\1]]/g' $FN
done

# Join files together
i=0
CATFILES=""
while [ $i -lt "$num" ]
do
  i=$(($i+1))
  CATFILES="${CATFILES} ${FILE}.${i}"
done
FINAL="${FILE}.final"
cat $CATFILES > $FINAL
rm $CATFILES

# insert a blank line at each third of the list to mark where to
# split it into three columns
lines=`wc -l < $FINAL`
col=`expr $lines / 3`
# GNU sed address "first~step" with G: append a blank line after
# every ${col}th line
pattern="${col}~${col}G"
sed -i $pattern $FINAL

echo "Resultant file is ${FINAL}"