From a35894670047601556dcf4d59bb7ce780260edfb Mon Sep 17 00:00:00 2001 From: Matthias-Christian Ott Date: Thu, 30 Dec 2021 23:18:47 +0100 Subject: [PATCH] Use socket.getaddrinfo instead of socket.gethostbyname socket.gethostbyname only returns a single IPv4 address. Any additional IPv4 addresses and all IPv6 addresses are ignored. As a consequence, hosts without IPv4 addresses are ignored and any additional information from additional IPv4 addresses is ignored. If IPv4 addresses are rotated by the implementation of socket.gethostbyname, for example, common with DNS servers and resolvers, the geolocation information for the host can change with every call of socket.gethostbyname. It seems better to consider all addresses, both all IPv4 and IPv6 addresses, and to summarize the geolocation information when only a single result is required. Signed-off-by: Matthias-Christian Ott --- utility/mm2_crawler | 43 +++++++++++++++++++++++++++--- utility/mm2_generate-worldmap | 50 +++++++++++++++++++++++++++++++++-- 2 files changed, 87 insertions(+), 6 deletions(-) diff --git a/utility/mm2_crawler b/utility/mm2_crawler index b26c8a434..4bc338819 100755 --- a/utility/mm2_crawler +++ b/utility/mm2_crawler @@ -20,6 +20,7 @@ import urllib2 import urlparse import gc import geoip2.database +import collections sys.path.append('..') import mirrormanager2.lib @@ -1196,14 +1197,48 @@ def check_continent(categoryUrl): hostname = hostname.split(':')[0] try: - hostname = socket.gethostbyname(hostname) - except: + addrinfo = socket.getaddrinfo(hostname, None) + except socket.gaierror: # Name resolution failed. Returning '5' as this means # that the base URL is broken. return 5 - country = gi.country(hostname).country.iso_code - if not country: + # Extract the IPv4 and IPv6 addresses from the tuples returned by getaddrinfo. + addresses = set() + for family, addrtype, proto, canonname, sockaddr in addrinfo: + # The GeoIP2 databases contain only information for IPv4 and IPv6 + # addresses. 
Therefore, other, unusual address families are ignored. + if family == socket.AF_INET: + address, port = sockaddr + addresses.add(address) + elif family == socket.AF_INET6: + address, port, flowinfo, scope_id = sockaddr + addresses.add(address) + # Retrieve the ISO 3166-1 code for each address. + countries = [] + for address in addresses: + try: + country = gi.country(address) + except geoip2.errors.AddressNotFoundError: + # If no country object is found for an IPv4 or IPv6 address, + # the address is ignored. + pass + else: + iso_code = country.country.iso_code + # If the ISO 3166-1 code is not available, the country cannot be + # matched to a continent. Therefore, the country object is ignored. + if iso_code is not None: + countries.append(iso_code) + # The GeoIP2 databases are neither perfect nor fully accurate. Therefore, + # multiple countries might be returned for hosts with multiple addresses. It + # seems best to use the most frequently occurring country if a host has + # multiple addresses. + country_counter = collections.Counter(countries) + if country_counter: + # most_common(1) returns a list with one element that is a tuple that + # consists of the item and its count. 
+ country = country_counter.most_common(1)[0][0] + else: # For hosts with no country in the GeoIP database # the default is 'US' as that is where most of # Fedora infrastructure systems are running diff --git a/utility/mm2_generate-worldmap b/utility/mm2_generate-worldmap index 6a44155fc..20392ede8 100755 --- a/utility/mm2_generate-worldmap +++ b/utility/mm2_generate-worldmap @@ -23,6 +23,7 @@ from pylab import * import urlparse import codecs import socket +import collections from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas @@ -54,8 +55,53 @@ def lookup_host_locations(config, gi): if hn in tracking: continue try: - ip = socket.gethostbyname(hn) - gir = gi.city(ip) + addrinfo = socket.getaddrinfo(hn, None) + # Extract the IPv4 and IPv6 addresses from the tuples returned by + # getaddrinfo. + addresses = set() + for family, addrtype, proto, canonname, sockaddr in addrinfo: + # The GeoIP2 databases contain only information for IPv4 and + # IPv6 addresses. Therefore, other, unusual address families are + # ignored. + if family == socket.AF_INET: + address, port = sockaddr + addresses.add(address) + elif family == socket.AF_INET6: + address, port, flowinfo, scope_id = sockaddr + addresses.add(address) + # Retrieve the city object for each address. + cities = [] + for address in addresses: + try: + city = gi.city(address) + except geoip2.errors.AddressNotFoundError: + # If no city object was found for an IPv4 or IPv6 address, + # the address is ignored. + pass + else: + # It seems that an empty city record is returned when no + # city was found. If no city has been found for an IPv4 or + # IPv6 address, the address is ignored. + if city.city.name is not None: + cities.append(city) + # If no city objects were found, the location of a host cannot be + # determined. + if not cities: + continue + city_names = (city.city.name for city in cities) + # Only the GeoIP2 Enterprise database has a confidence score for + # each city record. 
Therefore, it seems best to use the most + # frequently occurring city if a host has multiple addresses. + city_name_counter = collections.Counter(city_names) + # most_common(1) returns a list with one element that is a tuple that + # consists of the item and its count. + most_common_city_name = city_name_counter.most_common(1)[0][0] + # Find a city object for the most common city name. Any city object + # should be equivalent for a given city name. + for city in cities: + if most_common_city_name == city.city.name: + gir = city + break except: continue try: