libr/diff/bdiff.c
author pancake@pair
Sat, 04 Feb 2012 03:51:22 +0100
changeset 2007 ef840cc3d292
parent 619 e9e25f49d3fb
permissions -rw-r--r--
* Initial support for the z80 CPU
- assembler, disassembler and basic code analysis
- code analysis is very primitive atm
     1 /* radare - LGPL - Copyright 2009-2010 pancake<nopcode.org> */
     2 /* Adapted code from:
     3 
     4  bdiff.c - efficient binary diff extension for Mercurial
     5 
     6  Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
     7 
     8  This software may be used and distributed according to the terms of
     9  the GNU General Public License, incorporated herein by reference.
    10 
    11  Based roughly on Python difflib
    12 */
    13 
    14 #include <r_util.h>
    15 #include <r_diff.h>
    16 
    17 #include <stdlib.h>
    18 #include <string.h>
    19 #include <limits.h>
    20 
    21 struct line {
    22 	int h, len, n, e;
    23 	const char *l;
    24 };
    25 
    26 struct pos {
    27 	int pos, len;
    28 };
    29 
    30 struct hunk {
    31 	int a1, a2, b1, b2;
    32 };
    33 
    34 struct hunklist {
    35 	struct hunk *base, *head;
    36 };
    37 
    38 static int splitlines(const char *a, int len, struct line **lr)
    39 {
    40 	int h, i;
    41 	const char *p, *b = a;
    42 	const char * const plast = a + len - 1;
    43 	struct line *l;
    44 
    45 	if (a == NULL) {
    46 		eprintf ("null pointer received\n");
    47 		return 0;
    48 	}
    49 
    50 	/* count the lines */
    51 	i = 1; /* extra line for sentinel */
    52 	for (p = a; p < a + len; p++)
    53 		if (*p == '\n' || p == plast)
    54 			i++;
    55 
    56 	*lr = l = (struct line *)malloc(sizeof(struct line) * i);
    57 	if (!l)
    58 		return -1;
    59 
    60 	/* build the line array and calculate hashes */
    61 	h = 0;
    62 	for (p = a; p < a + len; p++) {
    63 		/* Leonid Yuriev's hash */
    64 		h = (h * 1664525) + *p + 1013904223;
    65 
    66 		if (*p == '\n' || p == plast) {
    67 			l->h = h;
    68 			h = 0;
    69 			l->len = p - b + 1;
    70 			l->l = b;
    71 			l->n = INT_MAX;
    72 			l++;
    73 			b = p + 1;
    74 		}
    75 	}
    76 
    77 	/* set up a sentinel */
    78 	l->h = l->len = 0;
    79 	l->l = a + len;
    80 	return i - 1;
    81 }
    82 
    83 static int inline cmp(struct line *a, struct line *b)
    84 {
    85 	return a->h != b->h || a->len != b->len || memcmp(a->l, b->l, a->len);
    86 }
    87 
    88 static int equatelines(struct line *a, int an, struct line *b, int bn)
    89 {
    90 	int i, j, buckets = 1, t, scale;
    91 	struct pos *h = NULL;
    92 
    93 	/* build a hash table of the next highest power of 2 */
    94 	while (buckets < bn + 1)
    95 		buckets *= 2;
    96 
    97 	/* try to allocate a large hash table to avoid collisions */
    98 	for (scale = 4; scale; scale /= 2) {
    99 		h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));
   100 		if (h)
   101 			break;
   102 	}
   103 
   104 	if (!h)
   105 		return 0;
   106 
   107 	buckets = buckets * scale - 1;
   108 
   109 	/* clear the hash table */
   110 	for (i = 0; i <= buckets; i++) {
   111 		h[i].pos = INT_MAX;
   112 		h[i].len = 0;
   113 	}
   114 
   115 	/* add lines to the hash table chains */
   116 	for (i = bn - 1; i >= 0; i--) {
   117 		/* find the equivalence class */
   118 		for (j = b[i].h & buckets; h[j].pos != INT_MAX;
   119 		     j = (j + 1) & buckets)
   120 			if (!cmp(b + i, b + h[j].pos))
   121 				break;
   122 
   123 		/* add to the head of the equivalence class */
   124 		b[i].n = h[j].pos;
   125 		b[i].e = j;
   126 		h[j].pos = i;
   127 		h[j].len++; /* keep track of popularity */
   128 	}
   129 
   130 	/* compute popularity threshold */
   131 	t = (bn >= 4000) ? bn / 1000 : bn + 1;
   132 
   133 	/* match items in a to their equivalence class in b */
   134 	for (i = 0; i < an; i++) {
   135 		/* find the equivalence class */
   136 		for (j = a[i].h & buckets; h[j].pos != INT_MAX;
   137 		     j = (j + 1) & buckets)
   138 			if (!cmp(a + i, b + h[j].pos))
   139 				break;
   140 
   141 		a[i].e = j; /* use equivalence class for quick compare */
   142 		if (h[j].len <= t)
   143 			a[i].n = h[j].pos; /* point to head of match list */
   144 		else
   145 			a[i].n = INT_MAX; /* too popular */
   146 	}
   147 
   148 	/* discard hash tables */
   149 	free(h);
   150 	return 1;
   151 }
   152 
   153 static int longest_match(struct line *a, struct line *b, struct pos *pos,
   154 			 int a1, int a2, int b1, int b2, int *omi, int *omj)
   155 {
   156 	int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;
   157 
   158 	for (i = a1; i < a2; i++) {
   159 		/* skip things before the current block */
   160 		for (j = a[i].n; j < b1; j = b[j].n)
   161 			;
   162 
   163 		/* loop through all lines match a[i] in b */
   164 		for (; j < b2; j = b[j].n) {
   165 			/* does this extend an earlier match? */
   166 			if (i > a1 && j > b1 && pos[j - 1].pos == i - 1)
   167 				k = pos[j - 1].len + 1;
   168 			else
   169 				k = 1;
   170 			pos[j].pos = i;
   171 			pos[j].len = k;
   172 
   173 			/* best match so far? */
   174 			if (k > mk) {
   175 				mi = i;
   176 				mj = j;
   177 				mk = k;
   178 			}
   179 		}
   180 	}
   181 
   182 	if (mk) {
   183 		mi = mi - mk + 1;
   184 		mj = mj - mk + 1;
   185 	}
   186 
   187 	/* expand match to include neighboring popular lines */
   188 	while (mi - mb > a1 && mj - mb > b1 &&
   189 	       a[mi - mb - 1].e == b[mj - mb - 1].e)
   190 		mb++;
   191 	while (mi + mk < a2 && mj + mk < b2 &&
   192 	       a[mi + mk].e == b[mj + mk].e)
   193 		mk++;
   194 
   195 	*omi = mi - mb;
   196 	*omj = mj - mb;
   197 
   198 	return mk + mb;
   199 }
   200 
   201 static void recurse(struct line *a, struct line *b, struct pos *pos,
   202 		    int a1, int a2, int b1, int b2, struct hunklist *l)
   203 {
   204 	int i, j, k;
   205 
   206 	/* find the longest match in this chunk */
   207 	k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);
   208 	if (!k)
   209 		return;
   210 
   211 	/* and recurse on the remaining chunks on either side */
   212 	recurse(a, b, pos, a1, i, b1, j, l);
   213 	l->head->a1 = i;
   214 	l->head->a2 = i + k;
   215 	l->head->b1 = j;
   216 	l->head->b2 = j + k;
   217 	l->head++;
   218 	recurse(a, b, pos, i + k, a2, j + k, b2, l);
   219 }
   220 
   221 static struct hunklist diff(struct line *a, int an, struct line *b, int bn)
   222 {
   223 	struct hunklist l;
   224 	struct hunk *curr;
   225 	struct pos *pos;
   226 	int t;
   227 
   228 	/* allocate and fill arrays */
   229 	t = equatelines(a, an, b, bn);
   230 	pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));
   231 	/* we can't have more matches than lines in the shorter file */
   232 	l.head = l.base = (struct hunk *)malloc(sizeof(struct hunk) *
   233 	                                        ((an<bn ? an:bn) + 1));
   234 
   235 	if (pos && l.base && t) {
   236 		/* generate the matching block list */
   237 		recurse(a, b, pos, 0, an, 0, bn, &l);
   238 		l.head->a1 = l.head->a2 = an;
   239 		l.head->b1 = l.head->b2 = bn;
   240 		l.head++;
   241 	}
   242 
   243 	free(pos);
   244 
   245 	/* normalize the hunk list, try to push each hunk towards the end */
   246 	for (curr = l.base; curr != l.head; curr++) {
   247 		struct hunk *next = curr+1;
   248 		int shift = 0;
   249 
   250 		if (next == l.head)
   251 			break;
   252 
   253 		if (curr->a2 == next->a1)
   254 			while (curr->a2+shift < an && curr->b2+shift < bn
   255 			       && !cmp(a+curr->a2+shift, b+curr->b2+shift))
   256 				shift++;
   257 		else if (curr->b2 == next->b1)
   258 			while (curr->b2+shift < bn && curr->a2+shift < an
   259 			       && !cmp(b+curr->b2+shift, a+curr->a2+shift))
   260 				shift++;
   261 		if (!shift)
   262 			continue;
   263 		curr->b2 += shift;
   264 		next->b1 += shift;
   265 		curr->a2 += shift;
   266 		next->a1 += shift;
   267 	}
   268 
   269 	return l;
   270 }
   271 
   272 //--
   273 // TODO: implement the r_diff_lines // we need to implement r_file_line_at (file, off);
   274 R_API int r_diff_buffers_delta(RDiff *d, const ut8 *sa, int la, const ut8 *sb, int lb) {
   275 	RDiffOp dop;
   276 	struct line *al, *bl;
   277 	struct hunklist l = { NULL, NULL };
   278 	struct hunk *h;
   279 	int an, bn, offa, rlen, offb, len = 0;
   280 	int hits = 0;
   281 
   282 	an = splitlines((const char *)sa, la, &al);
   283 	bn = splitlines((const char*)sb, lb, &bl);
   284 	if (!al || !bl) {
   285 		eprintf ("bindiff_buffers: Out of memory.\n");
   286 		return -1;
   287 	}
   288 
   289 	l = diff (al, an, bl, bn);
   290 	if (!l.head) {
   291 		eprintf ("bindiff_buffers: Out of memory.\n");
   292 		return -1;
   293 	}
   294 
   295 	la = lb = 0;
   296 	for (h = l.base; h != l.head; h++) {
   297 		if (h->a1 != la || h->b1 != lb) {
   298 			len = bl[h->b1].l - bl[lb].l;
   299 			offa = al[la].l - al->l;
   300 			offb = al[h->a1].l - al->l;
   301 			rlen = offb-offa;
   302 
   303 			if (d->callback) {
   304 				/* source file */
   305 				dop.a_off = offa;
   306 				dop.a_buf = (ut8 *)al[la].l;
   307 				dop.a_len = rlen;
   308 
   309 				/* destination file */
   310 				dop.b_off = offa; // XXX offb not used??
   311 				dop.b_buf = (ut8 *)bl[lb].l;
   312 				dop.b_len = len;
   313 				if (!d->callback (d, d->user, &dop))
   314 					break;
   315 			}
   316 #if 0	
   317 			if (rlen > 0) {
   318 				//printf ("Remove %d bytes at %d\n", rlen, offa);
   319 				printf ("r-%d @ 0x%"PFMT64x"\n", rlen, (ut64)offa);
   320 			}
   321 			printf ("e file.write=true\n"); // XXX
   322 			printf ("wx ");
   323 			for(i=0;i<len;i++)
   324 				printf ("%02x", bl[lb].l[i]);
   325 			printf (" @ 0x%"PFMT64x"\n", (ut64)offa);
   326 			rb += 12 + len;
   327 #endif
   328 		}
   329 		la = h->a2;
   330 		lb = h->b2;
   331 	}
   332 	free (al);
   333 	free (bl);
   334 	free (l.base);
   335 
   336 	return hits;
   337 }
   338