Rcjp's Weblog

October 1, 2007

Simple diff of 2 files

Filed under: c, lisp, python — rcjp @ 7:59 pm

A few days ago I wanted to compare two files each of which had one word per line (they were completion files for the rlwrap utility incidentally) thats easy enough with the unix shell commands, infact you can do it in one line

diff -iyw --suppress-common-lines <(sort -f file1) <(sort -f file2)

but I thought as a quick programming exercise I’d do it in C++/C, python and Common Lisp…

Calculating the difference is very easy using C++’s set_difference etc. but it is a surprise, I think for most programmers anyway, that you have to write your own case insensitive string comparison function. C++ sure is a peculiar mix of high and low level programming.

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <iterator>
#include <functional>

// 
// not sure there is any advantage to inherit from binary_function here?
// (infact we could just define a function rather than a struct and operator)
//
struct lessthan_nocase :
  public std::binary_function<const std::string&, const std::string&, bool>
{
    bool operator()(const std::string& s1, const std::string& s2) const
    {
        std::string::const_iterator p1 = s1.begin();
        std::string::const_iterator p2 = s2.begin();

        while(p1 != s1.end() && p2 != s2.end()) {
            if (toupper(*p1) != toupper(*p2)) return toupper(*p1) < toupper(*p2);
            ++p1;
            ++p2;
        }
        return s1.size() < s2.size();
    }
};

//
// set_difference only work on sorted containers
//
void sorted_readfile(const char* file, std::vector<std::string>& fvec)
{
    std::ifstream f(file);
    std::istream_iterator<std::string> finput(f), fend;

    copy(finput, fend, back_inserter(fvec));
    sort(fvec.begin(), fvec.end(), lessthan_nocase());
}

//
// dump results (show words if < MAXSHOW)
//
const unsigned int MAXSHOW = 10;

void display_diff(std::string title, std::vector<std::string> v)
{
    std::cout << title;
    if (v.size() < MAXSHOW) {
        std::cout << std::endl << std::string(title.size(), '-') << std::endl;
        copy(v.begin(), v.end(),
             std::ostream_iterator<std::string>(std::cout, "\n"));
    } else {
        std::cout << " =  " << v.size() << " words" << std::endl;
    }
    std::cout << std::endl;
}

int main(int argc, char* argv[])
{
    if (argc != 3) {
        std::cerr << "Usage: tdiff filename1 filename2" << std::endl;
        exit(1);
    }

    std::vector<std::string> f1;
    sorted_readfile(argv[1], f1);

    std::vector<std::string> f2;
    sorted_readfile(argv[2], f2);

    std::vector<std::string> notinf1;
    set_difference(f1.begin(), f1.end(), f2.begin(), f2.end(),
            back_inserter(notinf1), lessthan_nocase());
    display_diff("words in 1st but not in 2nd", notinf1);

    std::vector<std::string> notinf2;
    set_difference(f2.begin(), f2.end(), f1.begin(), f1.end(),
            back_inserter(notinf2), lessthan_nocase());
    display_diff("words in 2nd but not in 1st", notinf2);

    std::vector<std::string> symdiff;
    set_symmetric_difference(f2.begin(), f2.end(), f1.begin(), f1.end(),
            back_inserter(symdiff), lessthan_nocase());
    display_diff("symmetric difference", symdiff);

    std::vector<std::string> inter;
    set_intersection(f2.begin(), f2.end(), f1.begin(), f1.end(),
            back_inserter(inter), lessthan_nocase());
    display_diff("intersection", inter);

    return 0;
}

In C, the only thing to trip you up is getting the casting of the void * pointers in strcmp_nocase correct before trying to dereference them. Also I’ve cheated and used the non-ansi strcasecmp (sometimes called stricmp).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define MAXLINE  256
#define MAXWORDS 10000
#define MAXSHOW  10

typedef enum {false, true} bool;

bool strempty(char *str)
{
    char *p = str;
    while (*p) if (!isspace(*p++)) return false;
    return true;
}

int strcmp_nocase(const void *s1, const void *s2)
{
    /* The actual arguments to this function are "pointers to
       pointers to char", but strcmp() arguments are "pointers
       to char", hence the following cast plus dereference */
    return strcasecmp( *(char * const *)s1, * (char * const *) s2);
}

int sorted_readfile(char *filename, char **words)
{
    FILE *f = fopen(filename, "r");

    if (!f) {
        fprintf(stderr, "can't open %s\n", filename);
        exit(1);
    }

    int count = 0;
    char line[MAXLINE], *q;
    while (fgets(line, MAXLINE, f) && (count < MAXWORDS)) {
        if ((q=strpbrk(line,"\r\n"))) *q=0;     /* words dont include newlines */
        if (strempty(line)) continue;           /* skip blank lines */
        words[count] = (char *) malloc(strlen(line)+1);
        /* or words[count++] = strdup(line)*/
        strcpy(words[count], line);
        count++;
    }

    fclose(f);

    if (count == MAXWORDS) {
        fprintf(stderr, "increase MAXWORDS buffer size\n");
        exit(1);
    }
    qsort(words, count, sizeof(char *), strcmp_nocase);
    return count;
}

void display_diff(char *title, char **v, int vsize)
{
    int i;
    printf("%s", title);

    if (vsize < MAXSHOW) {
        printf("\n");
        for (i=0; i<strlen(title); i++) putchar('-');
        printf("\n");
        for (i=0; i<vsize; i++)
            printf("%s\n", v[i]);
    } else {
        printf(" = %d words\n", vsize);
    }
    printf("\n");
}

int main(int argc, char *argv[])
{
    if (argc != 3) {
        fprintf(stderr, "Usage: tdiff filename1 filename2\n");
        exit(1);
    }
    int i;
    char **f1words;
    f1words = malloc(MAXWORDS*sizeof(char*));
    int nf1words = sorted_readfile(argv[1], f1words);

    char **f2words;
    f2words = malloc(MAXWORDS*sizeof(char*));
    int nf2words = sorted_readfile(argv[2], f2words);

    char **diff;
    int ndiff = 0;
    diff = malloc(MAXWORDS*sizeof(char*));
    for (i=0; i < nf1words; i++) {
        if (!bsearch(&f1words[i], f2words, nf2words,
                     sizeof(char*), strcmp_nocase)) {
            /* could just point into f1words instead */
            diff[ndiff++] = strdup(f1words[i]);
        }
    }
    display_diff("words in 1st but not in 2nd", diff, ndiff);

    return 0;
}

In python, most of it is easy, though its perhaps not very pythonesque to cram some things on one line like I’ve done here. Making set case independent requires the __hash__ and __eq__ functions (I think that is all we need in this case) get overridden to use a saved lowercase version of the supplied string (see the python reference) .

import string

class NoCaseStr(str):
    def __init__(self, s):
        str.__init__(self, s)
        # keep a copy of the lower case string
        self.loweredstr = s.lower()

    def __eq__(self, s):
        return self.loweredstr == s.lower()

    def __hash__(self):
        return hash(self.loweredstr)

def display_diff(title, dset, Maxshow=10):
    if len(dset) < Maxshow:
        print title
        print len(title) * '-'
        print '\n'.join(dset)
    else:
        print title, '=', len(dset)
    print

def read_words(f):
    w = set(NoCaseStr(string.strip(word)) for word in open(f).readlines())
    w.discard('')# slightly clumsy... '  \n' gets stripped to '' so discard it
    return w

def tdiff(f1, f2):
    w1 = read_words(f1)
    w2 = read_words(f2)
    display_diff('in 1st but not in 2nd', w1.difference(w2))
    display_diff('in 2nd but not in 1st', w2.difference(w1))
    display_diff('symmetric difference',  w1.symmetric_difference(w2))
    display_diff('intersection',          w1.intersection(w2))


if __name__ == '__main__':
    import sys
    if len(sys.argv) < 3:
        print 'Usage: %s file1 file2' % sys.argv[0]
        sys.exit(1)

    tdiff(sys.argv[1], sys.argv[2])

My favourite language – Common Lisp has everything you need to do with without overriding anything. string-equal is case insensitive (as opposed to string=)

(defun read-words (file)
  (with-open-file (str file)
    (loop for line = (read-line str nil nil)
       for word = (string-trim " " line)
       while line
       unless (string= word "")
       collect word)))

(defun display-diff (title diff &optional (max-show 10))
  (format t "~A: ~V{~A~^ ~}~%" title max-show diff))

(defun tdiff (f1 f2)
  (let ((w1 (read-words f1))
        (w2 (read-words f2)))
    (display-diff "in 1st but not in 2nd" (set-difference w1 w2 :test #'string-equal))
    (display-diff "in 2nd but not in 1st" (set-difference w2 w1 :test #'string-equal))
    (display-diff "intersection" (intersection w1 w2 :test #'string-equal))))

(tdiff "/home/r/tmp/f1" "/home/r/tmp/f2")

September 20, 2007

Posting Code

Filed under: utils — rcjp @ 5:42 pm

After spending several frustrating sessions trying to find a solution to posting sourcecode on this blog I think I’ve got something workable.

Most of the methods I tried, including the one in the WordPress FAQ, fail on some operator characters found in lisp or C++ (like e.g.<<) and although I don’t use vim much these days, tending to prefer nvi, it does have a handy htmlize function :TOhtml. This, with some tweaking, generates html acceptable to WordPress from a buffer of code (or some lines marked in Visual Mode).

I used some vim scripting given below to strip out the <pre> section from the generated html, wrap <code> just inside it and copy it to the clipboard ready for pasting into the browser. I have it bound to the keystroke Alt-l. One further complication was that WordPress, trying to be helpful, swaps out some characters even though they are in a <code> block, so I do a search/replace for apostrophes.


func Convert2HTMLClip(line1, line2)
    if a:line2 >= a:line1
        let g:html_start_line = a:line1
        let g:html_end_line = a:line2
    else
        let g:html_start_line = a:line2
        let g:html_end_line = a:line1
    endif
    runtime syntax/2html.vim
    unlet g:html_start_line
    unlet g:html_end_line
    normal! gg
    exe ":0,/<pre>/- d"
    normal! A<code>
    normal! G
    exe ":?</pre>?+,$ d"
    normal! I</code>
    exe ":%s/'/\\&#39;/eg"
    exe ":%s/&amp;#39;/\\&amp;\\&#35;39;/eg"
    exe ":%s/&amp;#35;/\\&amp;\\&#35;35;/eg"
    normal! ggVG"+y
    exe ":bd!"
endfunc
command -range=% TohtmlClip :call Convert2HTMLClip(<line1>, <line2>)
map <M-l> :TohtmlClip<CR>

September 12, 2007

Inspirational

Filed under: physics — rcjp @ 4:09 pm

Everytime I look at this image, I have to stare at it for several minutes in sheer astonishment. Kudos to NASA for making these images available.

September 10, 2007

Blogging

Filed under: utils — rcjp @ 6:13 pm

I’ve been trying to decide between blogging from blogger.com or wordpress.com; here are a few of the features that swung the decision:

  • bigger blogging window: I don’t like typing through a letterbox and thankfully wordpress has Options->Writing->’Size of the post box’; couldn’t see anything similar in blogger though
  • private entries: amazingly blogger only lets you mark the whole blog as private, not individual entries like wordpress does
  • wordpress’s built in latex: its great to be able to type
    $latex \partial_{x}\alpha&s=2$

    and get \partial_{x}\alpha (the s=2 increases the size) see the faq entry

There were problems with wordpress though: I had to disable the wysiwyg editing of entries Users->Your Profile->’Use the visual rich editor when writing’ (at the top of the page) because switching between it and the code view kept messing things up; I prefer the code view anyway tbh.

The editor is a bit sluggish sometimes, pausing for something, but at least there are a few handy shorcuts – mark some text then press the keys:

Characters Code Lists
Bold: Alt+Shift+B Blockquote: Alt+Shift+Q Unordered List (ul): Alt+Shift+U
Italics: Alt+Shift+I Code: Alt+Shift+C Ordered List (ol): Alt+Shift +O
Link: Alt+Shift+A Read More: Alt+Shift+T List Item (li): Alt+Shift+L
Keyboard Shortcuts

Hopefully I’ll get around to coding something that’ll allow you to write in reStructuredText from within emacs and upload using the meta weblog api converting to html. All that is pretty easy I think, I’ve done parts of that process for my old blog; the hard part would be doing the reverse to allow you to edit existing posts parsed back to rst.

June 8, 2007

gtk containers

Filed under: c — rcjp @ 7:05 pm

The gtk glib library includes some standard container like linked lists…


/* Compile with cc doubly.c  `pkg-config --cflags --libs gtk+-2.0`  */

#include <glib.h>
#include <string.h>
#include <stdio.h>

void myfunc(gpointer data, gpointer user_data)
{
    printf("value %s\n", (char *)data);
}

int main()
{
    GList *list = NULL;
    GList *it = NULL;

    list = g_list_append(list, "one");
    list = g_list_append(list, "two");
    list = g_list_append(list, "three");

    for (it = list; it != NULL; it = it->next)
        printf("looping - %s\n", (char *)it->data);

    /* or alternatively  */

    g_list_foreach(list, (GFunc) myfunc, NULL);

    printf("length of list is %d\n", g_list_length(list ));
    return 0;
}

May 10, 2007

Sign Extending Unsigned Numbers

Filed under: c — rcjp @ 2:26 pm

Just playing about really, but I thought it would be worth re-affirming in my brain that you have to be careful using assembler instructions like CLW, CLQ etc. when dealing with unsigned numbers since they propagate the sign bit.:

    #include <stdio.h>
    void extend(unsigned char x)
    {
        int y;
        asm ("xor %%eax, %%eax;\n\t"      /* clear EAX */
             "movb %1, %%al;\n\t"         /* move the byte x into AL */
             "cbw;\n\t"
             "movl %%eax, %0;\n\t"
             :"=r"(y)                     /* output only */
             :"r"(x)                      /* input */
             :"%eax");                    /* clobbered regs */

        printf("x=%d, cbw extended=%d\n", x, y);
    }

    int main(void)
    {
        extend(127);
        extend(128);

        return 0;
    }

gives

    ~/c/tmp> ./a.out
    x=127, cbw extended=127
    x=128, cbw extended=65408

there is no binary flag for printf in C, but in Common Lisp:

    CL-USER> (format t "~20B~%" 128)
                10000000
    NIL
    CL-USER> (format t "~20B~%" 65408)
        1111111110000000
    NIL

Instead of doing it in inline C, I thought I’d try nasm + gcc

            extern  printf

            SECTION .data
    fmt:    db "cbw %d (previously %d)", 10, 0 

            SECTION .text
            global main        ; needed for gcc 
    main:
            push    ebp
            mov     ebp,esp

            xor     eax,eax
            mov     ax, 128
            push    eax
            cbw
            push    eax
            push    dword fmt
            call    printf

            add     esp, 12    ; pop 3 pushes
            mov     esp, ebp   ; clean stack frame
            pop     ebp
            mov     eax,0      ; no error return
            ret

building and running with

    ~/c/tmp> nasm -f elf nasm2.asm
    ~/c/tmp> gcc -o nasm2 nasm2.o
    ~/c/tmp> nasm2
    cbw 65408 (previously 128)

April 27, 2007

Finding open oflag Settings

Filed under: c, python — rcjp @ 11:03 am

In tracings you see the numerical value for flags like the oflag settings to the C open commands e.g.

    5901 open("/proc/asound/cards", 32768, 0666) = 7

to find out what they mean I just quickly did:

    In [36]: def oflag_to_string(oflag):
       ....:     for c in dir(os):
       ....:         if c.startswith('O_'):
       ....:             if eval('os.'+c) & oflag:
       ....:                 print c
       ....:
       ....:

    In [37]: oflag_to_string(32768)
    O_LARGEFILE

but afterwards realised I could have just done an strace since that nicely converts the values for you.

Producing Slides with Beamer

Filed under: utils — Tags: — rcjp @ 10:29 am

Next time I need to do some slides, I’m definitely going to use the latex beamer package. It seems really easy to use

    \documentclass{beamer}
    \usepackage[latin1]{inputenc}
    \usepackage{amssymb,amsmath}  % use mathematical symbols
    \usepackage{ccfonts, eulervm} % concrete fonts but with nicer upright maths
    \usetheme{Warsaw}

    \title{Title of my talk}
    \author{me}\institute{work}

    \begin{document}
    \begin{frame}
    \titlepage
    \end{frame}

    \begin{frame}
    I am the text located on the first content slide
    \begin{equation}
    x+y_2={1\over 2\pi}\int^\alpha_\infty \tan^2\beta = \Gamma\quad eqn1
    \end{equation}
    \end{frame}

    \begin{frame}
    And I am the text located in the second content slide
    \end{frame}
    \end{document}

I need to play with this some more. Some nice notes here

April 17, 2007

reStructured Text

Filed under: python, utils — rcjp @ 4:20 pm

rst is a very easy to use ascii markup language. With some python script or the rst2html tool you can quickly generate html. For example, the following html (between the horizontal lines) was generated by the program below and uses the python docutils module on the ascii testtext python variable (but would normally be a file read in from somewhere):


The Title

My first sentence and some more.

  • bullet one
  • two
    • sublists, just like bullet lists must be separated by blank lines
    • second sublist
  • three

and more paragraph text.

Referees: A.N. Other
Boo Boo

Some literal indented code:

program(1)
for i = 1, 10
    nothing
done

And some more text

Header 1 Header 2 Header 3
body row 1 column 2 column 3
body row 2 Cells may span columns.
body row 3 Cells may
span rows.
  • Cells
  • contain
  • blocks.
body row 4

Run with:

rst2html ttt2 > k.html

For more details look at docutils quickref. and also check the latest
developments validator and latex


and the python code…

#
# some example rst text manipulated with python below...
#
testtext="""
The Title
=========

My first sentence and some more.

- bullet one
- two 

  - sublists, just like bullet lists must be separated by blank lines
  - second sublist

- three

and more paragraph text.

:Referees:
    A.N. Other
    Boo Boo

Some literal indented code::

    program(1)
    for i = 1, 10
        nothing
    done

And some more text


+------------+------------+-----------+
| Header 1   | Header 2   | Header 3  |
+============+============+===========+
| body row 1 | column 2   | column 3  |
+------------+------------+-----------+
| body row 2 | Cells may span columns.|
+------------+------------+-----------+
| body row 3 | Cells may  | - Cells   |
+------------+ span rows. | - contain |
| body row 4 |            | - blocks. |
+------------+------------+-----------+

Run with::

    rst2html ttt2 > k.html

For more details look at docutils_ quickref. and also check the latest 
developments validator_ and latex_

.. _Python: http://www.python.org/
.. _docutils: http://docutils.sourceforge.net/docs/user/rst/quickref.html#bullet-lists
.. _validator: http://docutils.sourceforge.net/sandbox/dugui/
.. _latex: http://docutils.sourceforge.net/sandbox/latex_directive/
"""

from docutils.core import publish_string, publish_parts
#
# quickly dump out the html using my own stylesheet
#
output = open("test-rst.html", 'w')
customcss = {'embed_stylesheet':False, 'stylesheet_path':'include/mystyle.css'}
output.write(publish_string(testtext, writer_name = 'html',
                            settings_overrides=customcss))
output.close()
#
# just dump the body part, no stylesheet at all
#
output = open("test-rst-body.html", 'w')
parts = publish_parts(testtext, writer_name = 'html',
                      settings_overrides=dict(embed_stylesheet=False,
                                              stylesheet_path=None,))
output.write(parts['html_body'])
output.close()


Pulling out the body text is handy for generating html that you are inserting inside another document, like this blog entry for example.

April 12, 2007

Circular Halo

Filed under: physics — rcjp @ 12:53 pm

circular-halo1

I hardly noticed this without my Polaroid sunglasses on, as the colours were quite faint (and I’m fairly colour-blind), but the camera seemed to pick it out reasonably well; though its not as impressive as this one.

I tried to estimate the angular measurement of the halo and got my measurements wrong by quite a bit. From the sun to the start of the arc was the distance from my thumb to little finger tips with my hand held spread out at arms length. That’s 2 feet away from my eye and 8 inches across (I’ve measure that before and its easy to remember – all nice round numbers; no metric system centimeters here for estimating thank you – those units are too small.) This gives the halo an angular radius of about… (using a handy Common Lisp prompt)


    CL-USER> (* (/ 180 pi) (atan 8/24))
    18.434948654789338d0 

that’s nowhere near the 22 degrees its supposed to be. But of course, holding your arm up at an angle reduces the distance between your eye and your hand that I’d measured before with my arm out horizontally. Re-measuring at around 45 degrees the hand-eye distance is down to 20″ and directly overhead 16″:


    CL-USER> (* (/ 180 pi) (atan 8/20))
    21.801408877810637d0 

much better, phew.

« Newer PostsOlder Posts »

Blog at WordPress.com.