""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

RequestRate = collections.namedtuple("RequestRate", "requests seconds")


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.sitemaps = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax, otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]),
                                                         int(numbers[1]))
                        state = 2
                elif line[0] == "sitemap":
                    # the Sitemap directive is independent of the user-agent
                    # line, so we do not change the state of the parser
                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not to exist,
        # we must assume that no url is allowable.  This prevents false
        # positives when a user erroneously calls can_fetch() before
        # calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches; the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True