Rcjp's Weblog

October 1, 2007

Simple diff of 2 files

Filed under: c, lisp, python — rcjp @ 7:59 pm

A few days ago I wanted to compare two files each of which had one word per line (they were completion files for the rlwrap utility incidentally) thats easy enough with the unix shell commands, infact you can do it in one line

diff -iyw --suppress-common-lines <(sort -f file1) <(sort -f file2)

but I thought as a quick programming exercise I’d do it in C++/C, python and Common Lisp…

Calculating the difference is very easy using C++’s set_difference etc. but it is a surprise, I think for most programmers anyway, that you have to write your own case insensitive string comparison function. C++ sure is a peculiar mix of high and low level programming.

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <iterator>
#include <functional>

// 
// not sure there is any advantage to inherit from binary_function here?
// (infact we could just define a function rather than a struct and operator)
//
struct lessthan_nocase :
  public std::binary_function<const std::string&, const std::string&, bool>
{
    bool operator()(const std::string& s1, const std::string& s2) const
    {
        std::string::const_iterator p1 = s1.begin();
        std::string::const_iterator p2 = s2.begin();

        while(p1 != s1.end() && p2 != s2.end()) {
            if (toupper(*p1) != toupper(*p2)) return toupper(*p1) < toupper(*p2);
            ++p1;
            ++p2;
        }
        return s1.size() < s2.size();
    }
};

//
// set_difference only work on sorted containers
//
void sorted_readfile(const char* file, std::vector<std::string>& fvec)
{
    std::ifstream f(file);
    std::istream_iterator<std::string> finput(f), fend;

    copy(finput, fend, back_inserter(fvec));
    sort(fvec.begin(), fvec.end(), lessthan_nocase());
}

//
// dump results (show words if < MAXSHOW)
//
const unsigned int MAXSHOW = 10;

void display_diff(std::string title, std::vector<std::string> v)
{
    std::cout << title;
    if (v.size() < MAXSHOW) {
        std::cout << std::endl << std::string(title.size(), '-') << std::endl;
        copy(v.begin(), v.end(),
             std::ostream_iterator<std::string>(std::cout, "\n"));
    } else {
        std::cout << " =  " << v.size() << " words" << std::endl;
    }
    std::cout << std::endl;
}

int main(int argc, char* argv[])
{
    if (argc != 3) {
        std::cerr << "Usage: tdiff filename1 filename2" << std::endl;
        exit(1);
    }

    std::vector<std::string> f1;
    sorted_readfile(argv[1], f1);

    std::vector<std::string> f2;
    sorted_readfile(argv[2], f2);

    std::vector<std::string> notinf1;
    set_difference(f1.begin(), f1.end(), f2.begin(), f2.end(),
            back_inserter(notinf1), lessthan_nocase());
    display_diff("words in 1st but not in 2nd", notinf1);

    std::vector<std::string> notinf2;
    set_difference(f2.begin(), f2.end(), f1.begin(), f1.end(),
            back_inserter(notinf2), lessthan_nocase());
    display_diff("words in 2nd but not in 1st", notinf2);

    std::vector<std::string> symdiff;
    set_symmetric_difference(f2.begin(), f2.end(), f1.begin(), f1.end(),
            back_inserter(symdiff), lessthan_nocase());
    display_diff("symmetric difference", symdiff);

    std::vector<std::string> inter;
    set_intersection(f2.begin(), f2.end(), f1.begin(), f1.end(),
            back_inserter(inter), lessthan_nocase());
    display_diff("intersection", inter);

    return 0;
}

In C, the only thing to trip you up is getting the casting of the void * pointers in strcmp_nocase correct before trying to dereference them. Also I’ve cheated and used the non-ansi strcasecmp (sometimes called stricmp).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define MAXLINE  256
#define MAXWORDS 10000
#define MAXSHOW  10

typedef enum {false, true} bool;

bool strempty(char *str)
{
    char *p = str;
    while (*p) if (!isspace(*p++)) return false;
    return true;
}

int strcmp_nocase(const void *s1, const void *s2)
{
    /* The actual arguments to this function are "pointers to
       pointers to char", but strcmp() arguments are "pointers
       to char", hence the following cast plus dereference */
    return strcasecmp( *(char * const *)s1, * (char * const *) s2);
}

int sorted_readfile(char *filename, char **words)
{
    FILE *f = fopen(filename, "r");

    if (!f) {
        fprintf(stderr, "can't open %s\n", filename);
        exit(1);
    }

    int count = 0;
    char line[MAXLINE], *q;
    while (fgets(line, MAXLINE, f) && (count < MAXWORDS)) {
        if ((q=strpbrk(line,"\r\n"))) *q=0;     /* words dont include newlines */
        if (strempty(line)) continue;           /* skip blank lines */
        words[count] = (char *) malloc(strlen(line)+1);
        /* or words[count++] = strdup(line)*/
        strcpy(words[count], line);
        count++;
    }

    fclose(f);

    if (count == MAXWORDS) {
        fprintf(stderr, "increase MAXWORDS buffer size\n");
        exit(1);
    }
    qsort(words, count, sizeof(char *), strcmp_nocase);
    return count;
}

void display_diff(char *title, char **v, int vsize)
{
    int i;
    printf("%s", title);

    if (vsize < MAXSHOW) {
        printf("\n");
        for (i=0; i<strlen(title); i++) putchar('-');
        printf("\n");
        for (i=0; i<vsize; i++)
            printf("%s\n", v[i]);
    } else {
        printf(" = %d words\n", vsize);
    }
    printf("\n");
}

int main(int argc, char *argv[])
{
    if (argc != 3) {
        fprintf(stderr, "Usage: tdiff filename1 filename2\n");
        exit(1);
    }
    int i;
    char **f1words;
    f1words = malloc(MAXWORDS*sizeof(char*));
    int nf1words = sorted_readfile(argv[1], f1words);

    char **f2words;
    f2words = malloc(MAXWORDS*sizeof(char*));
    int nf2words = sorted_readfile(argv[2], f2words);

    char **diff;
    int ndiff = 0;
    diff = malloc(MAXWORDS*sizeof(char*));
    for (i=0; i < nf1words; i++) {
        if (!bsearch(&f1words[i], f2words, nf2words,
                     sizeof(char*), strcmp_nocase)) {
            /* could just point into f1words instead */
            diff[ndiff++] = strdup(f1words[i]);
        }
    }
    display_diff("words in 1st but not in 2nd", diff, ndiff);

    return 0;
}

In python, most of it is easy, though its perhaps not very pythonesque to cram some things on one line like I’ve done here. Making set case independent requires the __hash__ and __eq__ functions (I think that is all we need in this case) get overridden to use a saved lowercase version of the supplied string (see the python reference) .

import string

class NoCaseStr(str):
    def __init__(self, s):
        str.__init__(self, s)
        # keep a copy of the lower case string
        self.loweredstr = s.lower()

    def __eq__(self, s):
        return self.loweredstr == s.lower()

    def __hash__(self):
        return hash(self.loweredstr)

def display_diff(title, dset, Maxshow=10):
    if len(dset) < Maxshow:
        print title
        print len(title) * '-'
        print '\n'.join(dset)
    else:
        print title, '=', len(dset)
    print

def read_words(f):
    w = set(NoCaseStr(string.strip(word)) for word in open(f).readlines())
    w.discard('')# slightly clumsy... '  \n' gets stripped to '' so discard it
    return w

def tdiff(f1, f2):
    w1 = read_words(f1)
    w2 = read_words(f2)
    display_diff('in 1st but not in 2nd', w1.difference(w2))
    display_diff('in 2nd but not in 1st', w2.difference(w1))
    display_diff('symmetric difference',  w1.symmetric_difference(w2))
    display_diff('intersection',          w1.intersection(w2))


if __name__ == '__main__':
    import sys
    if len(sys.argv) < 3:
        print 'Usage: %s file1 file2' % sys.argv[0]
        sys.exit(1)

    tdiff(sys.argv[1], sys.argv[2])

My favourite language – Common Lisp has everything you need to do with without overriding anything. string-equal is case insensitive (as opposed to string=)

(defun read-words (file)
  (with-open-file (str file)
    (loop for line = (read-line str nil nil)
       for word = (string-trim " " line)
       while line
       unless (string= word "")
       collect word)))

(defun display-diff (title diff &optional (max-show 10))
  (format t "~A: ~V{~A~^ ~}~%" title max-show diff))

(defun tdiff (f1 f2)
  (let ((w1 (read-words f1))
        (w2 (read-words f2)))
    (display-diff "in 1st but not in 2nd" (set-difference w1 w2 :test #'string-equal))
    (display-diff "in 2nd but not in 1st" (set-difference w2 w1 :test #'string-equal))
    (display-diff "intersection" (intersection w1 w2 :test #'string-equal))))

(tdiff "/home/r/tmp/f1" "/home/r/tmp/f2")

Advertisements

Leave a Comment »

No comments yet.

RSS feed for comments on this post.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Blog at WordPress.com.

%d bloggers like this: