# HG changeset patch
# User Jeff Hammel
# Date 1308332377 25200
# Node ID 00266c7a7c3ccc4a4fa1ec9de9e6c9cf855dc5d4
# Parent 20dde2687cfbf41d8e7f38486961463a908da397
since there is only one module, don't bother with the whole directory thing

diff -r 20dde2687cfb -r 00266c7a7c3c setup.py
--- a/setup.py	Thu Jun 16 17:59:29 2011 -0700
+++ b/setup.py	Fri Jun 17 10:39:37 2011 -0700
@@ -6,15 +6,15 @@
 setup(name='urlmatch',
       version=version,
       description="match urls systematically",
-      long_description="""\
-""",
+      long_description='',
       classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
       keywords='url',
       author='Jeff Hammel',
       author_email='jhammel@mozilla.com',
       url='http://k0s.org/mozilla/hg/urlmatch',
       license='MPL',
-      packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
+      py_modules=['urlmatch'],
+      packages=[],
       include_package_data=True,
       zip_safe=False,
       install_requires=[
diff -r 20dde2687cfb -r 00266c7a7c3c urlmatch.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/urlmatch.py	Fri Jun 17 10:39:37 2011 -0700
@@ -0,0 +1,96 @@
+import urlparse
+
+class UrlMatcher(object):
+
+    def __init__(self, *urls):
+        match_order=('domain', 'scheme', 'path')
+        self.order = match_order
+        self.urls = {}
+        for url in urls:
+            self.add(url)
+
+    def decompose(self, url):
+
+        # break it down
+        (scheme, netloc, path, query, fragment) = urlparse.urlsplit(url)
+        urldict = {}
+
+        # domain
+        netloc = netloc.split('.')
+        if len(netloc) == 1:
+            urldict['domain'] = netloc
+        else:
+            # assert a TLD
+            urldict['domain'] = [netloc[-2], netloc[-1]] + list(reversed(netloc[0:-2]))
+
+        # path
+        path = path.strip('/').split('/')
+        if path == ['']:
+            path = []
+        urldict['path'] = path
+
+        # scheme
+        urldict['scheme'] = scheme
+
+        # could do others
+
+        return urldict
+
+    def add(self, url):
+        if url not in self.urls:
+            self.urls[url] = self.decompose(url)
+
+    def diff(self, url1, url2):
+
+        # decompose the urls if necessary
+        if isinstance(url1, basestring):
+            url1 = self.decompose(url1)
+        if isinstance(url2, basestring):
+            url2 = self.decompose(url2)
+
+        # TODO: finish
+        raise NotImplementedError
+
+    def match(self, url):
+        if '://' not in url:
+            # give a bogus scheme for urlparse. boo!
+            urldict = self.decompose('bogus://' + url)
+            urldict.pop('scheme')
+        else:
+            urldict = self.decompose(url)
+
+        order = self.order
+        urls = set(self.urls.keys())
+        for field in order:
+            value = urldict.get(field)
+            if not value:
+                # don't match trivial fields
+                continue
+            length = len(value)
+            deleted = set()
+            for key in list(urls)[:]:
+                compare_value = self.urls[key].get(field)
+                if not compare_value:
+                    urls.discard(key)
+                    continue
+                if isinstance(value, basestring) and value != compare_value:
+                    urls.discard(key)
+                    continue
+                if len(compare_value) < length:
+                    urls.discard(key)
+                    continue
+                if compare_value[:len(value)] != value:
+                    urls.discard(key)
+            if not urls:
+                return []
+        return urls
+
+if __name__ == '__main__':
+    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
+    matcher.add('http://www.example.com/foo/blah')
+    matcher.add('https://www.example.com/foo/')
+    matcher.add('https://www.example.net/foo/')
+    print matcher.match('example.com/foo/bar')
+    print matcher.match('http://example.com/foo')
+    print matcher.match('example.com')
+    print matcher.match('example')
diff -r 20dde2687cfb -r 00266c7a7c3c urlmatch/__init__.py
--- a/urlmatch/__init__.py	Thu Jun 16 17:59:29 2011 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,96 +0,0 @@
-import urlparse
-
-class UrlMatcher(object):
-
-    def __init__(self, *urls):
-        match_order=('domain', 'scheme', 'path')
-        self.order = match_order
-        self.urls = {}
-        for url in urls:
-            self.add(url)
-
-    def decompose(self, url):
-
-        # break it down
-        (scheme, netloc, path, query, fragment) = urlparse.urlsplit(url)
-        urldict = {}
-
-        # domain
-        netloc = netloc.split('.')
-        if len(netloc) == 1:
-            urldict['domain'] = netloc
-        else:
-            # assert a TLD
-            urldict['domain'] = [netloc[-2], netloc[-1]] + list(reversed(netloc[0:-2]))
-
-        # path
-        path = path.strip('/').split('/')
-        if path == ['']:
-            path = []
-        urldict['path'] = path
-
-        # scheme
-        urldict['scheme'] = scheme
-
-        # could do others
-
-        return urldict
-
-    def add(self, url):
-        if url not in self.urls:
-            self.urls[url] = self.decompose(url)
-
-    def diff(self, url1, url2):
-
-        # decompose the urls if necessary
-        if isinstance(url1, basestring):
-            url1 = self.decompose(url)
-        if isinstance(url2, basestring):
-            url2 = self.decompose(url)
-
-        # TODO: finish
-        raise NotImplementedError
-
-    def match(self, url):
-        if '://' not in url:
-            # give a bogus scheme for urlparse. boo!
-            urldict = self.decompose('bogus://' + url)
-            urldict.pop('scheme')
-        else:
-            urldict = self.decompose(url)
-
-        order = self.order
-        urls = set(self.urls.keys())
-        for field in order:
-            value = urldict.get(field)
-            if not value:
-                # don't match trivial fields
-                continue
-            length = len(value)
-            deleted = set()
-            for key in list(urls)[:]:
-                compare_value = self.urls[key].get(field)
-                if not compare_value:
-                    urls.discard(key)
-                    continue
-                if isinstance(value, basestring) and value != compare_value:
-                    urls.discard(key)
-                    continue
-                if len(compare_value) < length:
-                    urls.discard(key)
-                    continue
-                if compare_value[:len(value)] != value:
-                    urls.discard(key)
-            if not urls:
-                return []
-        return urls
-
-if __name__ == '__main__':
-    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem')
-    matcher.add('http://www.example.com/foo/blah')
-    matcher.add('https://www.example.com/foo/')
-    matcher.add('https://www.example.net/foo/')
-    print matcher.match('example.com/foo/bar')
-    print matcher.match('http://example.com/foo')
-    print matcher.match('example.com')
-    print matcher.match('example')
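
A minimal usage sketch (an illustration, not part of the changeset): Python 2, mirroring the module's own __main__ block, and assuming urlmatch.py is importable, e.g. after running python setup.py develop with the new setup.py above.

    from urlmatch import UrlMatcher

    # Register known URLs, then match a (possibly partial) URL against them.
    # match() returns the set of registered URLs consistent with the query;
    # domains are compared from the registered domain outward, so
    # 'example.com' matches 'www.example.com'.
    matcher = UrlMatcher('http://www.example.com/foo/bar/fleem',
                         'https://www.example.net/foo/')
    print matcher.match('example.com/foo')   # only the example.com URL survives
    print matcher.match('example.net')       # only the example.net URL survives
    print matcher.match('example.org')       # nothing registered matches: []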